diff --git a/.bazelrc b/.bazelrc index e9fc2d4eb20a55..9de6b6e0c2bd54 100644 --- a/.bazelrc +++ b/.bazelrc @@ -526,34 +526,9 @@ build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_c build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl" test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --config=cuda +build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1 -build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true -build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true -build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true -build:rbe_linux_cuda_nvcc --config=tensorrt -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80" -build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12" -build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8" -build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2" -build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" -build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc --config=rbe_linux -build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9" -build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3" -# These you may need to change for your own GCP project. 
-common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance -build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda" -build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt" -build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl" -test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" +build:rbe_linux_cuda_nvcc --action_env=TF_NVCC_CLANG="1" # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed build:rbe_win --config=rbe_base @@ -692,19 +667,39 @@ build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda build:release_cpu_macos --config=avx_linux test:release_cpu_macos --config=release_base -# Build configs for macOS ARM CPUs +# Base build configs for macOS +build:release_macos_base --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer +build:release_macos_base --define=no_nccl_support=true --output_filter=^$ + +# Build configs for macOS x86 +build:release_macos_x86 --config=release_macos_base +# Build with the AVX instruction set when on macOS x86 +build:release_macos_x86 --config=avx_linux +build:release_macos_x86 --cpu=darwin +# Target Catalina as the minimum compatible OS version +build:release_macos_x86 --macos_minimum_os=10.15 +build:release_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15 + +# Build configs for macOS Arm64 +build:release_macos_arm64 --config=release_macos_base build:release_macos_arm64 --cpu=darwin_arm64 -# Set DEVELOPER_DIR to select a version of Xcode. -build:release_macos_arm64 --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer -build:release_macos_arm64 --define=no_nccl_support=true -# Suppress all warning messages -build:release_macos_arm64 --output_filter=^$ -# Disable MKL build:release_macos_arm64 --define=tensorflow_mkldnn_contraction_kernel=0 # Target Moneterey as the minimum compatible OS version build:release_macos_arm64 --macos_minimum_os=12.0 build:release_macos_arm64 --action_env MACOSX_DEPLOYMENT_TARGET=12.0 +# Base test configs for macOS +test:release_macos_base --verbose_failures=true --local_test_jobs=HOST_CPUS +test:release_macos_base --test_timeout=300,450,1200,3600 --test_output=errors +test:release_macos_base --build_tests_only --keep_going +test:release_macos_base --flaky_test_attempts=3 + +# Test configs for macOS x86 +test:release_macos_x86 --config=release_macos_base + +# Test configs for macOS Arm64 +test:release_macos_arm64 --config=release_macos_base + # TODO(kanglan): Update windows configs after b/289091160 is fixed build:release_cpu_windows --config=avx_win build:release_cpu_windows --define=no_tensorflow_py_deps=true @@ -723,10 +718,14 @@ build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compil # Use --config=tf_public_cache to try and use the TensorFlow public build cache # to build TensorFlow. Look at ci/official/envs to find which types of jobs -# push to the cache. +# push to the cache. For macOS, use --config=tf_public_macos_cache build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false # Cache pushes are limited to TF's CI system. 
build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials +# Public cache for macOS builds +build:tf_public_macos_cache --remote_cache="https://storage.googleapis.com/tensorflow-macos-bazel-cache/oct2023" --remote_upload_local_results=false +# Cache pushes are limited to TF's CI system. +build:tf_public_macos_cache_push --config=tf_public_macos_cache --remote_upload_local_results=true --google_default_credentials # END TF CACHE HELPER OPTIONS # BEGIN TF TEST SUITE OPTIONS @@ -743,22 +742,27 @@ build:linux_libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow. test:linux_cpu_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cpu_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cpu_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # CUDA WHEEL -test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 -test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 WHEEL test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... 
-//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test +test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test # MACOS ARM64 WHEEL test:macos_arm64_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 -test:macos_arm64_wheel_test_filters --test_lang_filters=py -test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:macos_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium +test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... +# MACOS X86 WHEEL +test:macos_x86_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +test:macos_x86_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +test:macos_x86_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium +test:macos_x86_wheel_test --config=macos_x86_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over # the whole TF code base. 
These are usually run continuously or upon presubmit. @@ -766,21 +770,53 @@ test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorf test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # CUDA PYCPP: test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 PYCPP test:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3 # TODO(michaelhudgins): Why do we need to specifically omit go and java here? -test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test +test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... 
-//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test -//tensorflow/python/tools:aot_compiled_test +# CROSS-COMPILE ARM64 PYCPP +test:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test +# Tests that fail only when cross-compiled +test:cross_compile_linux_arm64_pycpp_test -//tensorflow/compiler/mlir/quantization/stablehlo:convert_tf_quant_to_mhlo_int_test # MACOS ARM64 PYCPP test:macos_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 -test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py -test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium +test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... -//tensorflow/core/kernels/image:resize_bicubic_op_test # END TF TEST SUITE OPTIONS + +# START LINUX AARCH64 CROSS-COMPILE CONFIGS +# Set execution platform to Linux x86 +# Note: Lot of the "host_" flags such as "host_cpu" and "host_crosstool_top" +# flags seem to be actually used to specify the execution platform details. It +# seems it is this way because these flags are old and predate the distinction +# between host and execution platform. 
+build:cross_compile_linux_arm64 --host_cpu=k8 +build:cross_compile_linux_arm64 --host_crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite +build:cross_compile_linux_arm64 --extra_execution_platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_x86_64 + +# Set the target CPU to Aarch64 +build:cross_compile_linux_arm64 --platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_aarch64 +build:cross_compile_linux_arm64 --cpu=aarch64 +build:cross_compile_linux_arm64 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite + +# RBE configs +build:rbe_cross_compile_linux_arm64 --config=cross_compile_linux_arm64 +build:rbe_cross_compile_linux_arm64 --config=rbe_base +build:rbe_cross_compile_linux_arm64 --remote_instance_name=projects/tensorflow-testing/instances/default_instance + +# Test-related settings below this point +# We cannot run cross-compiled tests on the remote Linux x86 VMs so we need to +# force all tests to run locally on the Aarch64 host. +test:rbe_cross_compile_linux_arm64 --strategy=TestRunner=local +test:rbe_cross_compile_linux_arm64 --verbose_failures=true --local_test_jobs=HOST_CPUS --test_output=errors +test:rbe_cross_compile_linux_arm64 --flaky_test_attempts=3 --build_tests_only +# END LINUX AARCH64 CROSS-COMPILE CONFIGS diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml index 96467ebaeb35a9..3b07683008391d 100644 --- a/.github/workflows/arm-ci.yml +++ b/.github/workflows/arm-ci.yml @@ -20,12 +20,6 @@ on: branches: - master - r2.** - pull_request: - types: [labeled, opened, synchronize, reopened] - branches: - - master - - r2.** - permissions: contents: read diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml index bb39d60168e08d..fb7366768436c5 100644 --- a/.github/workflows/osv-scanner-scheduled.yml +++ b/.github/workflows/osv-scanner-scheduled.yml @@ -27,6 +27,7 @@ permissions: jobs: scan-scheduled: + if: github.repository == 'tensorflow/tensorflow' uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable.yml@main" with: scan-args: |- @@ -36,4 +37,4 @@ jobs: --lockfile=requirements.txt:./requirements_lock_3_12.txt --lockfile=requirements.txt:./ci/official/containers/linux_arm64/devel.requirements.txt --lockfile=requirements.txt:./ci/official/containers/linux_arm64/jax.requirements.txt - --lockfile=requirements.txt:./ci/official/containers/linux_arm64/devel.usertools/test.requirements.txt \ No newline at end of file + --lockfile=requirements.txt:./ci/official/containers/linux_arm64/devel.usertools/test.requirements.txt diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml index 84118acca683fd..e439c0f180ed44 100644 --- a/.github/workflows/stale-issues.yml +++ b/.github/workflows/stale-issues.yml @@ -31,7 +31,7 @@ jobs: pull-requests: write steps: - name: Awaiting response issues - uses: actions/stale@v7 + uses: actions/stale@6f05e4244c9a0b2ed3401882b05d701dd0a7289b # v7.0.0 with: #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: 'override-stale' @@ -59,7 +59,7 @@ jobs: close-pr-message: "This PR was closed because it has been inactive for 14 days since being marked as stale. Please reopen if you'd like to work on this further." 
repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Contribution issues - uses: actions/stale@v7 + uses: actions/stale@6f05e4244c9a0b2ed3401882b05d701dd0a7289b # v7.0.0 with: #Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale exempt-issue-labels: 'override-stale' diff --git a/RELEASE.md b/RELEASE.md index 75350aeccc5542..6ee5c0ca55fa5f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -9,6 +9,12 @@ * * +* `tf.summary.trace_on` now takes a `profiler_outdir` argument. This must be set + if `profiler` arg is set to `True`. + * `tf.summary.trace_export`'s `profiler_outdir` arg is now a no-op. Enabling + the profiler now requires setting `profiler_outdir` in `trace_on`. + + ### Known Caveats * @@ -32,6 +38,17 @@ * Added support for `stablehlo.multiply`. * Added support for `stablehlo.maximum`. * Added support for `stablehlo.minimum`. + * Added boolean parameter support for `tfl.gather_nd`. + +* `tf.CheckpointOptions` + * It now takes in a new argument called `experimental_sharding_callback`. + This is a callback function wrapper that will be executed to determine how + tensors will be split into shards when the saver writes the checkpoint + shards to disk. `tf.train.experimental.ShardByTaskPolicy` is the default + sharding behavior, but `tf.train.experimental.MaxShardSizePolicy` can be + used to shard the checkpoint with a maximum shard file size. Users with + advanced use cases can also write their own custom + `tf.train.experimental.ShardingCallback`s. ## Keras @@ -48,6 +65,9 @@ table maintained by the layer. If this layer is not used in conjunction with `UpdateEmbeddingCallback` the behavior of the layer would be same as `keras.layers.Embedding`. +* `keras.optimizers.Adam` + * Added the option to set adaptive epsilon to match implementations with Jax + and PyTorch equivalents. ### Breaking Changes @@ -77,6 +97,39 @@ This release contains contributions from many people at Google, as well as: , , , , , +# Release 2.15.0.post1 + +## TensorFlow + +### Bug Fixes and Other Changes + +* Hot-fix was needed for an issue affecting the TensorFlow installation + process. + * TensorFlow 2.15.0 Python package was requesting `tensorrt`-related + packages that cannot be found unless the user installs them beforehand + or provides additional installation flags. + * This dependency affected anyone installing TensorFlow 2.15 alongside + NVIDIA CUDA dependencies via `pip install tensorflow[and-cuda]`. + * Depending on the installation method, TensorFlow 2.14 would be installed + instead of 2.15, or users could receive an installation error due to + those missing dependencies. +* TensorFlow 2.15.0.post1 is being released for Linux x86_64 to resolve this + issue as quickly as possible. + * This version removes the `tensorrt` Python package dependencies from the + tensorflow[and-cuda] installation method to ensure `pip install + tensorflow[and-cuda]` works as originally intended for TensorFlow 2.15. + * Support for TensorRT is otherwise unaffected as long as TensorRT is + already installed on the system. +* Using .post1 instead of a full minor release allowed us to push this release + out quickly. However, please note the following caveat: + * For users wishing to pin their Python dependency in a requirements file + or other situation, under Python's version specification rules, + `tensorflow[and-cuda]==2.15.0` will not install this fixed version. 
Please use `==2.15.0.post1` to specify this exact version on Linux + platforms, or a fuzzy version specification, such as `==2.15.*`, to + specify the most recent compatible version of TensorFlow 2.15 on all + platforms. + # Release 2.15.0 ## TensorFlow @@ -164,29 +217,26 @@ This release contains contributions from many people at Google, as well as: * Provided a new `experimental_skip_saver` argument which, if specified, will suppress the addition of `SavedModel`-native save and restore ops to the `SavedModel`, for cases where users already build custom save/restore ops and checkpoint formats for the model being saved, and the creation of the SavedModel-native save/restore ops simply cause longer model serialization times. -* `tf.math.bincount` - * Updated documentation. Fixed "[Bincount doesn't check the tensor type](https://github.com/tensorflow/tensorflow/issues/56499)" and some other corner cases. - -## Keras - -### Breaking Changes - -### Known Caveats - -### Major Features and Improvements - -### Bug Fixes and Other Changes - * Add ops to `tensorflow.raw_ops` that were missing. + * `tf.CheckpointOptions` * It now takes in a new argument called `experimental_write_callbacks`. These are callbacks that will be executed after a saving event finishes writing the checkpoint file. + * Add an option `disable_eager_executer_streaming_enqueue` to `tensorflow.ConfigProto.Experimental` to control the eager runtime's behavior around parallel remote function invocations; when set to `True`, the eager runtime will be allowed to execute multiple function invocations in parallel. + * `tf.constant_initializer` - * It now takes a new argument called `support_partition`. If True, constant_initializers can create sharded variables. This is disabled by default similar to existing behavior. + * It now takes a new argument called `support_partition`. If True, constant_initializers can create sharded variables. This is disabled by default, similar to existing behavior. * `tf.lite` * Added support for `stablehlo.scatter`. +* `tf.estimator` + * The tf.estimator API removal is in progress and will be targeted for the 2.16 release. + +## Keras + +* This will be the final release before the launch of Keras 3.0, when Keras will become multi-backend. For the compatibility page and other info, please see: https://github.com/keras-team/keras-core + ## Thanks to our Contributors This release contains contributions from many people at Google, as well as: diff --git a/ci/official/any.sh b/ci/official/any.sh index 74e4caa666d259..8eae7cd85f3445 100755 --- a/ci/official/any.sh +++ b/ci/official/any.sh @@ -34,8 +34,10 @@ if [[ -n "${TF_ANY_SCRIPT:-}" ]]; then echo "source ci/official/envs/disable_all_uploads" >> any export TFCI=$(realpath any) "$TF_ANY_SCRIPT" -else +elif [[ -n "${TF_ANY_TARGETS:-}" ]]; then source "${BASH_SOURCE%/*}/utilities/setup.sh" - read -ra TARGETS_AS_ARRAY <<<"$TF_ANY_TARGETS" - tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" "${TF_ANY_MODE:-test}" "${TFCI_BAZEL_COMMON_ARGS[@]}" "${TARGETS_AS_ARRAY[@]}" + tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS "${TF_ANY_MODE:-test}" $TFCI_BAZEL_COMMON_ARGS $TF_ANY_TARGETS +else + echo 'Looks like $TF_ANY_TARGETS and $TF_ANY_SCRIPT are both empty. That is an error.'
+ exit 1 fi diff --git a/ci/official/containers/linux_arm64/devel.usertools/aarch64.bazelrc b/ci/official/containers/linux_arm64/devel.usertools/aarch64.bazelrc index f41974b5b6ab7d..f2a08d60720f9a 100644 --- a/ci/official/containers/linux_arm64/devel.usertools/aarch64.bazelrc +++ b/ci/official/containers/linux_arm64/devel.usertools/aarch64.bazelrc @@ -49,7 +49,7 @@ test --test_summary=short test:nonpip_filters --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:nonpip_filters --build_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:nonpip_filters --test_lang_filters=py --flaky_test_attempts=3 --test_size_filters=small,medium -test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # "pip tests" run a similar suite of tests the "nonpip" tests, but do something # odd to attempt to validate the quality of the pip package. The wheel is @@ -70,10 +70,10 @@ test:pip_venv --python_path="/bazel_pip/bin/python3" test:pip_venv --define=no_tensorflow_py_deps=true test:pip --config=pip_venv # Yes, we don't exclude the gpu tests on pip for some reason. -test:pip_filters --test_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 -test:pip_filters --build_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:pip_filters --test_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:pip_filters --build_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:pip_filters --test_lang_filters=py --flaky_test_attempts=3 --test_size_filters=small,medium -test:pip --config=pip_filters -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/compiler/xrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pip --config=pip_filters -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/... -//tensorflow/tools/toolchains/... 
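For context, the "nonpip" and "pip" suite configs defined above in this aarch64 bazelrc are selected with --config. The lines below are an illustrative sketch only, not part of the patch; they assume a Linux aarch64 machine with Bazel on PATH and a TF_PYTHON_VERSION matching an installed interpreter, and the real drivers under ci/official may pass additional flags.
# Illustrative sketch, not part of the patch: load this file explicitly as the
# bazelrc and run the "nonpip" test suite it defines.
bazel --bazelrc=ci/official/containers/linux_arm64/devel.usertools/aarch64.bazelrc \
  test --config=nonpip --repo_env=TF_PYTHON_VERSION=3.11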
# For building libtensorflow archives test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test @@ -83,4 +83,4 @@ build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz test:pycpp_filters --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:pycpp_filters --build_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:pycpp_filters --test_lang_filters=cc,py --flaky_test_attempts=3 --test_size_filters=small,medium -test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... diff --git a/ci/official/containers/linux_arm64/devel.usertools/aarch64_clang.bazelrc b/ci/official/containers/linux_arm64/devel.usertools/aarch64_clang.bazelrc index 50b3851db88ea0..0cb20a89b4bd7f 100644 --- a/ci/official/containers/linux_arm64/devel.usertools/aarch64_clang.bazelrc +++ b/ci/official/containers/linux_arm64/devel.usertools/aarch64_clang.bazelrc @@ -60,7 +60,7 @@ test --test_summary=short test:nonpip_filters --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:nonpip_filters --build_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:nonpip_filters --test_lang_filters=py --flaky_test_attempts=3 --test_size_filters=small,medium -test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # "pip tests" run a similar suite of tests the "nonpip" tests, but do something # odd to attempt to validate the quality of the pip package. The wheel is @@ -81,10 +81,10 @@ test:pip_venv --python_path="/bazel_pip/bin/python3" test:pip_venv --define=no_tensorflow_py_deps=true test:pip --config=pip_venv # Yes, we don't exclude the gpu tests on pip for some reason. -test:pip_filters --test_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 -test:pip_filters --build_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:pip_filters --test_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:pip_filters --build_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:pip_filters --test_lang_filters=py --flaky_test_attempts=3 --test_size_filters=small,medium -test:pip --config=pip_filters -- //bazel_pip/tensorflow/... 
-//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/compiler/xrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pip --config=pip_filters -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/... -//tensorflow/tools/toolchains/... # For building libtensorflow archives test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test @@ -94,4 +94,4 @@ build:libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow.tar.gz test:pycpp_filters --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:pycpp_filters --build_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_aarch64,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:pycpp_filters --test_lang_filters=cc,py --flaky_test_attempts=3 --test_size_filters=small,medium -test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... diff --git a/ci/official/envs/ci_default b/ci/official/envs/ci_default index eb7938c8b3449d..a8e212ff47f005 100644 --- a/ci/official/envs/ci_default +++ b/ci/official/envs/ci_default @@ -1,9 +1,9 @@ -TFCI_BAZEL_BAZELRC_ARGS=() +# Note: this gets sourced in utilities/setup.sh +TFCI_BAZEL_BAZELRC_ARGS= +TFCI_BAZEL_COMMON_ARGS= TFCI_BAZEL_CONFIG_PREFIX= -TFCI_BAZEL_COMMON_ARGS=() -TFCI_PYTHON_VERSION= -TFCI_BUILD_PIP_PACKAGE_ARGS=() -TFCI_DOCKER_ARGS=() +TFCI_BUILD_PIP_PACKAGE_ARGS= +TFCI_DOCKER_ARGS= TFCI_DOCKER_ENABLE=1 TFCI_DOCKER_IMAGE= TFCI_DOCKER_PULL_ENABLE=1 @@ -15,15 +15,24 @@ TFCI_LIB_SUFFIX= TFCI_NIGHTLY_UPDATE_VERSION_ENABLE= TFCI_NVIDIA_SMI_ENABLE= TFCI_OUTPUT_DIR=build_output -TFCI_LIBTPU_DOWNLOAD_ENABLE=0 -TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=0 -TFCI_LIBTPU_DOWNLOAD_URL= +TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS= +TFCI_PYTHON_VERSION= TFCI_UPLOAD_LIB_ENABLE= TFCI_UPLOAD_LIB_LATEST_ENABLE= TFCI_UPLOAD_LIB_LATEST_URI= TFCI_UPLOAD_LIB_URI= TFCI_UPLOAD_WHL_GCS_ENABLE= TFCI_UPLOAD_WHL_GCS_URI= -TFCI_UPLOAD_WHL_PYPI_ARGS=() +TFCI_UPLOAD_WHL_PYPI_ARGS= TFCI_UPLOAD_WHL_PYPI_ENABLE= +TFCI_WHL_AUDIT_ENABLE=1 +TFCI_WHL_AUDIT_PLAT= TFCI_WHL_BAZEL_TEST_ENABLE=1 +TFCI_WHL_SIZE_LIMIT= +TFCI_WHL_SIZE_LIMIT_ENABLE=1 +TFCI_MACOS_UPGRADE_PYENV_ENABLE= +TFCI_MACOS_INSTALL_BAZELISK_ENABLE= +TFCI_MACOS_INSTALL_BAZELISK_URL= +TFCI_MACOS_PYENV_INSTALL_ENABLE= +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE= +TFCI_MACOS_BAZEL_TEST_DIR_PATH= diff --git a/ci/official/envs/ci_nightly_uploads b/ci/official/envs/ci_nightly_uploads index ca6671f5ea3c59..7f62baf903c7e6 100644 --- a/ci/official/envs/ci_nightly_uploads +++ b/ci/official/envs/ci_nightly_uploads @@ -4,5 +4,5 @@ TFCI_UPLOAD_LIB_LATEST_ENABLE=1 TFCI_UPLOAD_LIB_LATEST_GCS_URI="gs://libtensorflow-nightly/latest" TFCI_UPLOAD_WHL_GCS_ENABLE=0 TFCI_UPLOAD_WHL_GCS_URI= 
-TFCI_UPLOAD_WHL_PYPI_ARGS=(--config-file="$KOKORO_KEYSTORE_DIR/73361_tensorflow_pypirc_using_global_api_token" --repository pypi-warehouse) +TFCI_UPLOAD_WHL_PYPI_ARGS="--config-file=$KOKORO_KEYSTORE_DIR/73361_tensorflow_pypirc_using_global_api_token --repository pypi-warehouse" TFCI_UPLOAD_WHL_PYPI_ENABLE=1 diff --git a/ci/official/envs/continuous_linux_arm64_cpu_py310 b/ci/official/envs/continuous_linux_arm64_cpu_py310 index b8d7e5c3228356..5f8d16be1aaa6a 100644 --- a/ci/official/envs/continuous_linux_arm64_cpu_py310 +++ b/ci/official/envs/continuous_linux_arm64_cpu_py310 @@ -1,7 +1,6 @@ # This envrionment is experimental and should not yet be used for production jobs -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.10 +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux --config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/continuous_linux_arm64_cpu_py311 b/ci/official/envs/continuous_linux_arm64_cpu_py311 index 7a0ae9e84a1134..410fecc1d7be39 100644 --- a/ci/official/envs/continuous_linux_arm64_cpu_py311 +++ b/ci/official/envs/continuous_linux_arm64_cpu_py311 @@ -1,7 +1,6 @@ # This envrionment is experimental and should not yet be used for production jobs -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.11 +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux --config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" +TFCI_PYTHON_VERSION=3.11 diff --git a/ci/official/envs/continuous_linux_arm64_cpu_py311_cross_compile b/ci/official/envs/continuous_linux_arm64_cpu_py311_cross_compile new file mode 100644 index 00000000000000..d506aca9441b98 --- /dev/null +++ b/ci/official/envs/continuous_linux_arm64_cpu_py311_cross_compile @@ -0,0 +1,6 @@ +# This envrionment is experimental and should not yet be used for production jobs +TFCI_BAZEL_COMMON_ARGS="--config rbe_cross_compile_linux_arm64 --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=cross_compile_linux_arm64 +TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" +TFCI_PYTHON_VERSION=3.11 diff --git a/ci/official/envs/continuous_linux_arm64_cpu_py39 b/ci/official/envs/continuous_linux_arm64_cpu_py39 index 53aee870f9c66a..7b98c0b838d000 100644 --- a/ci/official/envs/continuous_linux_arm64_cpu_py39 +++ b/ci/official/envs/continuous_linux_arm64_cpu_py39 @@ -1,7 +1,6 @@ # This envrionment is experimental and should not yet be used for production jobs -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.9 +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux 
--config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" +TFCI_PYTHON_VERSION=3.9 diff --git a/ci/official/envs/continuous_linux_arm64_cpu_py39_cross_compile b/ci/official/envs/continuous_linux_arm64_cpu_py39_cross_compile new file mode 100644 index 00000000000000..23870d6c181bd3 --- /dev/null +++ b/ci/official/envs/continuous_linux_arm64_cpu_py39_cross_compile @@ -0,0 +1,6 @@ +# This envrionment is experimental and should not yet be used for production jobs +TFCI_BAZEL_COMMON_ARGS="--config rbe_cross_compile_linux_arm64 --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=cross_compile_linux_arm64 +TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" +TFCI_PYTHON_VERSION=3.9 diff --git a/ci/official/envs/continuous_linux_x86_cpu_py310 b/ci/official/envs/continuous_linux_x86_cpu_py310 index 13b2730a609d4b..5297dd60604781 100644 --- a/ci/official/envs/continuous_linux_x86_cpu_py310 +++ b/ci/official/envs/continuous_linux_x86_cpu_py310 @@ -1,6 +1,5 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.10 +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_DOCKER_IMAGE=tensorflow/build:latest-pythonlatest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/continuous_linux_x86_cpu_py311 b/ci/official/envs/continuous_linux_x86_cpu_py311 index 3f92c5c2513257..4a306e19f97258 100644 --- a/ci/official/envs/continuous_linux_x86_cpu_py311 +++ b/ci/official/envs/continuous_linux_x86_cpu_py311 @@ -1,6 +1,5 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.11 +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_PYTHON_VERSION=3.11 diff --git a/ci/official/envs/continuous_linux_x86_cpu_py39 b/ci/official/envs/continuous_linux_x86_cpu_py39 index 4ca275cf32a943..6b225c4e8f3170 100644 --- 
a/ci/official/envs/continuous_linux_x86_cpu_py39 +++ b/ci/official/envs/continuous_linux_x86_cpu_py39 @@ -1,6 +1,5 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.9 +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config rbe_linux_cpu --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_PYTHON_VERSION=3.9 diff --git a/ci/official/envs/continuous_linux_x86_cuda_py310 b/ci/official/envs/continuous_linux_x86_cuda_py310 index f09a5d55110948..95e30867ced0ed 100644 --- a/ci/official/envs/continuous_linux_x86_cuda_py310 +++ b/ci/official/envs/continuous_linux_x86_cuda_py310 @@ -1,8 +1,7 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.10 -TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/continuous_linux_x86_cuda_py311 b/ci/official/envs/continuous_linux_x86_cuda_py311 index cd834c2acfbde1..8bc69dc0ed514c 100644 --- a/ci/official/envs/continuous_linux_x86_cuda_py311 +++ b/ci/official/envs/continuous_linux_x86_cuda_py311 @@ -1,8 +1,7 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.11 -TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_PYTHON_VERSION=3.11 diff --git a/ci/official/envs/continuous_linux_x86_cuda_py39 b/ci/official/envs/continuous_linux_x86_cuda_py39 index 798dfdf25109d4..3899fed43065ba 100644 --- a/ci/official/envs/continuous_linux_x86_cuda_py39 +++ b/ci/official/envs/continuous_linux_x86_cuda_py39 @@ -1,8 +1,7 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.9 -TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config rbe_linux_cuda 
--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config rbe_linux_cuda --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_PYTHON_VERSION=3.9 diff --git a/ci/official/envs/continuous_macos_arm64_py310 b/ci/official/envs/continuous_macos_arm64_py310 index a08a3350534751..81e98e74ea4c80 100644 --- a/ci/official/envs/continuous_macos_arm64_py310 +++ b/ci/official/envs/continuous_macos_arm64_py310 @@ -1,5 +1,6 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.10 +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_ENABLE=0 +TFCI_PYTHON_VERSION=3.10 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/continuous_macos_arm64_py311 b/ci/official/envs/continuous_macos_arm64_py311 index 230d18d7c2b2f6..f4e7ce7120a858 100644 --- a/ci/official/envs/continuous_macos_arm64_py311 +++ b/ci/official/envs/continuous_macos_arm64_py311 @@ -1,5 +1,6 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.11 +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_ENABLE=0 +TFCI_PYTHON_VERSION=3.11 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/continuous_macos_arm64_py39 b/ci/official/envs/continuous_macos_arm64_py39 index 59585ff1b37857..66ca0b11dfb918 100644 --- a/ci/official/envs/continuous_macos_arm64_py39 +++ b/ci/official/envs/continuous_macos_arm64_py39 @@ -1,5 +1,6 @@ -source ci/official/envs/ci_default -TFCI_PYTHON_VERSION=3.9 +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) TFCI_DOCKER_ENABLE=0 +TFCI_PYTHON_VERSION=3.9 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_libtensorflow_linux_x86_cpu b/ci/official/envs/nightly_libtensorflow_linux_x86_cpu index 9fbd23ae501601..d5e7b0b634f0ef 100644 --- a/ci/official/envs/nightly_libtensorflow_linux_x86_cpu +++ b/ci/official/envs/nightly_libtensorflow_linux_x86_cpu @@ -1,8 +1,7 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.10 -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore 
--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_LIB_SUFFIX="-cpu-linux-x86_64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/nightly_libtensorflow_linux_x86_cuda b/ci/official/envs/nightly_libtensorflow_linux_x86_cuda index 0b35c0e67f78ee..adb557c7845196 100644 --- a/ci/official/envs/nightly_libtensorflow_linux_x86_cuda +++ b/ci/official/envs/nightly_libtensorflow_linux_x86_cuda @@ -1,10 +1,9 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.10 -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_LIB_SUFFIX="-gpu-linux-x86_64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 TFCI_NVIDIA_SMI_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/nightly_libtensorflow_macos_arm64 b/ci/official/envs/nightly_libtensorflow_macos_arm64 index d29447dc50415c..195563aaa1f79c 100644 --- a/ci/official/envs/nightly_libtensorflow_macos_arm64 +++ b/ci/official/envs/nightly_libtensorflow_macos_arm64 @@ -1,8 +1,7 @@ -source ci/official/envs/ci_default -source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.10 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +# Disable arm64 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_DOCKER_ENABLE=0 -TFCI_LIB_SUFFIX="-cpu-macos-arm64" +TFCI_LIB_SUFFIX="-cpu-darwin-arm64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 -TFCI_UPLOAD_WHL_GCS_URI=1 \ No newline at end of file +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/nightly_libtensorflow_macos_x86 b/ci/official/envs/nightly_libtensorflow_macos_x86 new file mode 100644 index 00000000000000..113111468bfb67 --- /dev/null +++ b/ci/official/envs/nightly_libtensorflow_macos_x86 @@ -0,0 +1,7 @@ +# Disable macOS x86 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_macos_x86 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_DOCKER_ENABLE=0 +TFCI_LIB_SUFFIX="-cpu-darwin-x86_64" +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py310 
b/ci/official/envs/nightly_linux_arm64_cpu_py310 index 5b7900c43423b2..99abd33e228d06 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py310 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py310 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default # Disable arm64 uploads while being worked on source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.10 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 +TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64 +TFCI_WHL_SIZE_LIMIT_ENABLE= diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py311 b/ci/official/envs/nightly_linux_arm64_cpu_py311 index 6edb93ba0bdf73..5ce6b38552bee4 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py311 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py311 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default # Disable arm64 uploads while being worked on source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.11 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.11 +TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64 +TFCI_WHL_SIZE_LIMIT_ENABLE= diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py312 b/ci/official/envs/nightly_linux_arm64_cpu_py312 index dfe96fafb5568e..59ac34a405b3cb 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py312 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py312 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default # Disable arm64 uploads while being worked on source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.12 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf 
ci/official/containers/linux_arm64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.12 +TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64 +TFCI_WHL_SIZE_LIMIT_ENABLE= diff --git a/ci/official/envs/nightly_linux_arm64_cpu_py39 b/ci/official/envs/nightly_linux_arm64_cpu_py39 index e3b516111fdc85..e707083e020661 100644 --- a/ci/official/envs/nightly_linux_arm64_cpu_py39 +++ b/ci/official/envs/nightly_linux_arm64_cpu_py39 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default # Disable arm64 uploads while being worked on source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.9 -TFCI_BAZEL_COMMON_ARGS=(--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_arm64_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python -TFCI_DOCKER_REBUILD_ARGS=(--target=tf ci/official/containers/linux_arm64) +TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.9 +TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64 +TFCI_WHL_SIZE_LIMIT_ENABLE= diff --git a/ci/official/envs/nightly_linux_x86_cpu_py310 b/ci/official/envs/nightly_linux_x86_cpu_py310 index 574ac7bee1f004..6576b8ab239593 100644 --- a/ci/official/envs/nightly_linux_x86_cpu_py310 +++ b/ci/official/envs/nightly_linux_x86_cpu_py310 @@ -1,9 +1,10 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.10 -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=240M diff --git a/ci/official/envs/nightly_linux_x86_cpu_py311 b/ci/official/envs/nightly_linux_x86_cpu_py311 index d1b8bfea93cc74..544fff21a905fd 100644 --- a/ci/official/envs/nightly_linux_x86_cpu_py311 +++ b/ci/official/envs/nightly_linux_x86_cpu_py311 @@ -1,9 +1,10 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.11 -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu 
--nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.11 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=240M diff --git a/ci/official/envs/nightly_linux_x86_cpu_py312 b/ci/official/envs/nightly_linux_x86_cpu_py312 index 586fd92e5d703c..b8442d9e03cb4a 100644 --- a/ci/official/envs/nightly_linux_x86_cpu_py312 +++ b/ci/official/envs/nightly_linux_x86_cpu_py312 @@ -1,10 +1,10 @@ -source ci/official/envs/ci_default -# Disable 3.12 uploads while being worked on -source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.12 -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +source ci/official/envs/ci_nightly_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.12 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=240M diff --git a/ci/official/envs/nightly_linux_x86_cpu_py39 b/ci/official/envs/nightly_linux_x86_cpu_py39 index 2c3e1183a37171..69696ee814f77e 100644 --- a/ci/official/envs/nightly_linux_x86_cpu_py39 +++ b/ci/official/envs/nightly_linux_x86_cpu_py39 @@ -1,9 +1,10 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.9 -TFCI_BAZEL_COMMON_ARGS=(--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_cpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.9 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=240M diff --git a/ci/official/envs/nightly_linux_x86_cuda_py310 b/ci/official/envs/nightly_linux_x86_cuda_py310 index 16038d62bd646d..ec26fb1cb14905 100644 --- a/ci/official/envs/nightly_linux_x86_cuda_py310 +++ b/ci/official/envs/nightly_linux_x86_cuda_py310 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.10 
-TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_BUILD_PIP_PACKAGE_ARGS="--nightly_flag" +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_cuda_py311 b/ci/official/envs/nightly_linux_x86_cuda_py311 index 1d0d931477a686..e7101efa94cb57 100644 --- a/ci/official/envs/nightly_linux_x86_cuda_py311 +++ b/ci/official/envs/nightly_linux_x86_cuda_py311 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.11 -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_BUILD_PIP_PACKAGE_ARGS="--nightly_flag" +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.11 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_cuda_py312 b/ci/official/envs/nightly_linux_x86_cuda_py312 index 4767f6dbdd6483..4b9e371ae26ed3 100644 --- a/ci/official/envs/nightly_linux_x86_cuda_py312 +++ b/ci/official/envs/nightly_linux_x86_cuda_py312 @@ -1,11 +1,11 @@ -source ci/official/envs/ci_default -# Disable 3.12 uploads while being worked on -source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.12 -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +source ci/official/envs/ci_nightly_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_BUILD_PIP_PACKAGE_ARGS="--nightly_flag" +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 \ No 
newline at end of file +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.12 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_cuda_py39 b/ci/official/envs/nightly_linux_x86_cuda_py39 index e3a5d3f8c8d1a8..63ee868a8db0b3 100644 --- a/ci/official/envs/nightly_linux_x86_cuda_py39 +++ b/ci/official/envs/nightly_linux_x86_cuda_py39 @@ -1,10 +1,11 @@ -source ci/official/envs/ci_default source ci/official/envs/ci_nightly_uploads -TFCI_PYTHON_VERSION=3.9 -TFCI_BAZEL_COMMON_ARGS=(--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) +TFCI_BAZEL_COMMON_ARGS="--config release_gpu_linux --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cuda -TFCI_BUILD_PIP_PACKAGE_ARGS=(--nightly_flag) -TFCI_DOCKER_ARGS=(--gpus all) +TFCI_BUILD_PIP_PACKAGE_ARGS="--nightly_flag" +TFCI_DOCKER_ARGS="--gpus all" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.9 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_tpu_py310 b/ci/official/envs/nightly_linux_x86_tpu_py310 index 4e8014120f3762..8367da6b55b456 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py310 +++ b/ci/official/envs/nightly_linux_x86_tpu_py310 @@ -1,11 +1,13 @@ -source ci/official/envs/ci_default # Disable tpu uploads while being worked on -source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.10 -TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) +source ci/official/envs/ci_nightly_uploads +TFCI_BAZEL_COMMON_ARGS="--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=release_cpu_linux --config=tpu" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--tpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS="-f https://storage.googleapis.com/libtpu-releases/index.html" +TFCI_PYTHON_VERSION=3.10 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_BAZEL_TEST_ENABLE=0 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_tpu_py311 b/ci/official/envs/nightly_linux_x86_tpu_py311 index e4ae8cccf4fd46..8a186aad7dcce0 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py311 +++ b/ci/official/envs/nightly_linux_x86_tpu_py311 @@ -1,11 +1,13 @@ -source ci/official/envs/ci_default # Disable tpu uploads while being worked on 
-source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.11 -TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) +source ci/official/envs/ci_nightly_uploads +TFCI_BAZEL_COMMON_ARGS="--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=release_cpu_linux --config=tpu" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--tpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS="-f https://storage.googleapis.com/libtpu-releases/index.html" +TFCI_PYTHON_VERSION=3.11 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_BAZEL_TEST_ENABLE=0 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_tpu_py312 b/ci/official/envs/nightly_linux_x86_tpu_py312 index 54d96b16548a4a..0f8c73bd601e26 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py312 +++ b/ci/official/envs/nightly_linux_x86_tpu_py312 @@ -1,11 +1,13 @@ -source ci/official/envs/ci_default # Disable tpu uploads while being worked on -source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.12 -TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) +source ci/official/envs/ci_nightly_uploads +TFCI_BAZEL_COMMON_ARGS="--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=release_cpu_linux --config=tpu" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--tpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS="-f https://storage.googleapis.com/libtpu-releases/index.html" +TFCI_PYTHON_VERSION=3.12 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_BAZEL_TEST_ENABLE=0 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_linux_x86_tpu_py39 b/ci/official/envs/nightly_linux_x86_tpu_py39 index 4adaa8b216fbba..aa413f939ee5fd 100644 --- a/ci/official/envs/nightly_linux_x86_tpu_py39 +++ b/ci/official/envs/nightly_linux_x86_tpu_py39 @@ -1,11 +1,13 @@ -source ci/official/envs/ci_default # Disable tpu uploads while being worked on -source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.9 -TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=tpu) +source ci/official/envs/ci_nightly_uploads +TFCI_BAZEL_COMMON_ARGS="--config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config=release_cpu_linux --config=tpu" 
TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu -TFCI_BUILD_PIP_PACKAGE_ARGS=(--tpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--tpu --nightly_flag" TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} -TFCI_DOCKER_REBUILD_ARGS=(--build-arg PYTHON_VERSION=$TFCI_PYTHON_VERSION --target=devel tools/tf_sig_build_dockerfiles) -TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE=1 +TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS="-f https://storage.googleapis.com/libtpu-releases/index.html" +TFCI_PYTHON_VERSION=3.9 +TFCI_WHL_AUDIT_PLAT=manylinux2014_x86_64 +TFCI_WHL_BAZEL_TEST_ENABLE=0 +TFCI_WHL_SIZE_LIMIT=580M diff --git a/ci/official/envs/nightly_macos_arm64_py310 b/ci/official/envs/nightly_macos_arm64_py310 index 81fa2c977d6944..6c007ce1c318d7 100644 --- a/ci/official/envs/nightly_macos_arm64_py310 +++ b/ci/official/envs/nightly_macos_arm64_py310 @@ -1,9 +1,12 @@ -source ci/official/envs/ci_default source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.10 +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_ENABLE=0 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 -TFCI_UPLOAD_WHL_GCS_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=240M +TFCI_MACOS_PYENV_INSTALL_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_macos_arm64_py311 b/ci/official/envs/nightly_macos_arm64_py311 index e8046a3b5951b1..a3dfd672843273 100644 --- a/ci/official/envs/nightly_macos_arm64_py311 +++ b/ci/official/envs/nightly_macos_arm64_py311 @@ -1,9 +1,11 @@ -source ci/official/envs/ci_default source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.11 +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_ENABLE=0 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 -TFCI_UPLOAD_WHL_GCS_ENABLE=1 +TFCI_PYTHON_VERSION=3.11 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=240M +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_macos_arm64_py312 b/ci/official/envs/nightly_macos_arm64_py312 index 21432f076f6283..3da9c1040956da 100644 --- a/ci/official/envs/nightly_macos_arm64_py312 +++ b/ci/official/envs/nightly_macos_arm64_py312 @@ -1,9 +1,12 @@ -source ci/official/envs/ci_default source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.12 +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" 
TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_ENABLE=0 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 -TFCI_UPLOAD_WHL_GCS_ENABLE=1 +TFCI_PYTHON_VERSION=3.12 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=240M +TFCI_MACOS_PYENV_INSTALL_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_macos_arm64_py39 b/ci/official/envs/nightly_macos_arm64_py39 index ee58e84c6624ca..36682a1e08421b 100644 --- a/ci/official/envs/nightly_macos_arm64_py39 +++ b/ci/official/envs/nightly_macos_arm64_py39 @@ -1,9 +1,14 @@ -source ci/official/envs/ci_default source ci/official/envs/disable_all_uploads -TFCI_PYTHON_VERSION=3.9 +# TODO(srnitin): Add resultstore config once the macOS builds have the right +# permissions +TFCI_BAZEL_COMMON_ARGS="--config release_macos_arm64 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_arm64 -TFCI_BAZEL_COMMON_ARGS=(--config release_macos_arm64 --config tf_public_cache_push --config resultstore --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION) -TFCI_BUILD_PIP_PACKAGE_ARGS=(--cpu --nightly_flag) +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" TFCI_DOCKER_ENABLE=0 TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 -TFCI_UPLOAD_WHL_GCS_ENABLE=1 +TFCI_PYTHON_VERSION=3.9 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=240M +TFCI_MACOS_PYENV_INSTALL_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_macos_x86_py310 b/ci/official/envs/nightly_macos_x86_py310 new file mode 100644 index 00000000000000..9577841dea84ec --- /dev/null +++ b/ci/official/envs/nightly_macos_x86_py310 @@ -0,0 +1,16 @@ +# Disable macOS x86 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_macos_x86 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_x86 +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" +TFCI_DOCKER_ENABLE=0 +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.10 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=255M +TFCI_MACOS_INSTALL_BAZELISK_ENABLE=1 +TFCI_MACOS_INSTALL_BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-darwin-amd64" +TFCI_MACOS_UPGRADE_PYENV_ENABLE=1 +TFCI_MACOS_PYENV_INSTALL_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_macos_x86_py311 b/ci/official/envs/nightly_macos_x86_py311 new file mode 100644 index 00000000000000..4fe9bad43f89f6 --- /dev/null +++ b/ci/official/envs/nightly_macos_x86_py311 @@ -0,0 +1,16 @@ +# Disable macOS x86 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_macos_x86 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_x86 +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" +TFCI_DOCKER_ENABLE=0 +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.11 +TFCI_WHL_AUDIT_ENABLE= 
+TFCI_WHL_SIZE_LIMIT=255M +TFCI_MACOS_INSTALL_BAZELISK_ENABLE=1 +TFCI_MACOS_INSTALL_BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-darwin-amd64" +TFCI_MACOS_UPGRADE_PYENV_ENABLE=1 +TFCI_MACOS_PYENV_INSTALL_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" diff --git a/ci/official/envs/nightly_macos_x86_py312 b/ci/official/envs/nightly_macos_x86_py312 new file mode 100644 index 00000000000000..a4397de120d90c --- /dev/null +++ b/ci/official/envs/nightly_macos_x86_py312 @@ -0,0 +1,16 @@ +# Disable macOS x86 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_macos_x86 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_x86 +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" +TFCI_DOCKER_ENABLE=0 +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.12 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=255M +TFCI_MACOS_INSTALL_BAZELISK_ENABLE=1 +TFCI_MACOS_INSTALL_BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-darwin-amd64" +TFCI_MACOS_UPGRADE_PYENV_ENABLE=1 +TFCI_MACOS_PYENV_INSTALL_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" \ No newline at end of file diff --git a/ci/official/envs/nightly_macos_x86_py39 b/ci/official/envs/nightly_macos_x86_py39 new file mode 100644 index 00000000000000..58c570c5d10507 --- /dev/null +++ b/ci/official/envs/nightly_macos_x86_py39 @@ -0,0 +1,14 @@ +# Disable macOS x86 uploads while being worked on +source ci/official/envs/disable_all_uploads +TFCI_BAZEL_COMMON_ARGS="--config release_macos_x86 --config tf_public_macos_cache_push --repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" +TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=macos_x86 +TFCI_BUILD_PIP_PACKAGE_ARGS="--cpu --nightly_flag" +TFCI_DOCKER_ENABLE=0 +TFCI_NIGHTLY_UPDATE_VERSION_ENABLE=1 +TFCI_PYTHON_VERSION=3.9 +TFCI_WHL_AUDIT_ENABLE= +TFCI_WHL_SIZE_LIMIT=255M +TFCI_MACOS_INSTALL_BAZELISK_ENABLE=1 +TFCI_MACOS_INSTALL_BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-darwin-amd64" +TFCI_MACOS_BAZEL_TEST_DIR_ENABLE=1 +TFCI_MACOS_BAZEL_TEST_DIR_PATH="/Volumes/BuildData/bazel_output" \ No newline at end of file diff --git a/ci/official/envs/sample b/ci/official/envs/sample index 1e01d6ae93b877..e7717e0b25fcae 100644 --- a/ci/official/envs/sample +++ b/ci/official/envs/sample @@ -16,7 +16,7 @@ set +u; source ci/official/envs/your_choice_here; set -u # different Python versions. You can add e.g. "--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION" # to change the Python version to anything available (including the default) in # tensorflow/tools/toolchains/python/python_repo.bzl. -TFCI_BAZEL_COMMON_ARGS=(--config tf_public_cache --disk_cache=build_output/cache) +TFCI_BAZEL_COMMON_ARGS='--config tf_public_cache --disk_cache=build_output/cache' # Disable all CI-specific behavior. You never need any of these if you are # running a script locally. 
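Because the TFCI_*_ARGS settings in these env files are now plain strings rather than bash arrays, the CI scripts consume them through ordinary word splitting. A minimal sketch of local use, under the assumptions that you are at the root of a TensorFlow checkout and that ci/official/envs/sample is a reasonable stand-in for your own env file (setup.sh performs the same sourcing with extra ordering logic):

    # Export everything the env file defines, following the
    # "set -a; source ...; set +a" pattern that setup.sh suggests.
    set -a
    source ci/official/envs/sample
    set +a
    # Expand the string unquoted so each "--config ..." token becomes a
    # separate bazel argument, as the scripts below do via $TFCI_BAZEL_COMMON_ARGS.
    bazel build $TFCI_BAZEL_COMMON_ARGS //tensorflow/tools/pip_package:build_pip_package
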
diff --git a/ci/official/libtensorflow.sh b/ci/official/libtensorflow.sh index 402de63ebc97a2..e6b8ff4dd865b3 100755 --- a/ci/official/libtensorflow.sh +++ b/ci/official/libtensorflow.sh @@ -25,8 +25,8 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then tfrun python3 tensorflow/tools/ci_build/update_version.py --nightly fi -tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=linux_libtensorflow_test -tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" build "${TFCI_BAZEL_COMMON_ARGS[@]}" --config=linux_libtensorflow_build +tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS test $TFCI_BAZEL_COMMON_ARGS --config=linux_libtensorflow_test +tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS build $TFCI_BAZEL_COMMON_ARGS --config=linux_libtensorflow_build tfrun ./ci/official/utilities/repack_libtensorflow.sh "$TFCI_OUTPUT_DIR" "$TFCI_LIB_SUFFIX" diff --git a/ci/official/pycpp.sh b/ci/official/pycpp.sh index 6a4bd8821bbefb..3c83fd5772e1b8 100755 --- a/ci/official/pycpp.sh +++ b/ci/official/pycpp.sh @@ -15,7 +15,7 @@ # ============================================================================== source "${BASH_SOURCE%/*}/utilities/setup.sh" -tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_pycpp_test" +tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS test $TFCI_BAZEL_COMMON_ARGS --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_pycpp_test" # Note: the profile can be viewed by visiting chrome://tracing in a Chrome browser. # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling diff --git a/ci/official/utilities/cleanup_summary.sh b/ci/official/utilities/cleanup_summary.sh index 046e1d79014953..dbe2203fa130af 100755 --- a/ci/official/utilities/cleanup_summary.sh +++ b/ci/official/utilities/cleanup_summary.sh @@ -14,6 +14,8 @@ # limitations under the License. # ============================================================================== +set -euxo pipefail + function resultstore_extract_fallback { # In case the main script fails somehow. cat < $BATS_TEST_TMPDIR/pip_deps + bazel cquery --keep_going 'deps(//tensorflow/tools/pip_package:build_pip_package)' | sort -u > $BATS_TEST_TMPDIR/pip_deps # Find all Python py_test targets not tagged "no_pip" or "manual", excluding # any targets in ignored packages. Combine this list of targets into a bazel # query list (e.g. the list becomes "target+target2+target3") - bazel query --keep_going 'kind(py_test, //tensorflow/python/...) - attr("tags", "no_pip|manual", //tensorflow/python/...)' | grep -v -f $BATS_TEST_TMPDIR/ignore_deps_for_these_packages | paste -sd "+" - > $BATS_TEST_TMPDIR/deps + bazel cquery --keep_going 'kind(py_test, //tensorflow/python/...) - attr("tags", "no_pip|manual", //tensorflow/python/...)' | grep -v -f $BATS_TEST_TMPDIR/ignore_deps_for_these_packages | paste -sd "+" - > $BATS_TEST_TMPDIR/deps # Find all one-step dependencies of those tests which are from //tensorflow # (since external deps will come from Python-level pip dependencies), # excluding dependencies and files that are known to be unneccessary. # This creates a list of targets under //tensorflow that are required for # TensorFlow python tests. 
- bazel query --keep_going "deps($(cat $BATS_TEST_TMPDIR/deps), 1)" | grep "^//tensorflow" | grep -v -f $BATS_TEST_TMPDIR/ignore_these_deps | sort -u > $BATS_TEST_TMPDIR/required_deps + bazel cquery --keep_going "deps($(cat $BATS_TEST_TMPDIR/deps), 1)" | grep "^//tensorflow" | grep -v -f $BATS_TEST_TMPDIR/ignore_these_deps | sort -u > $BATS_TEST_TMPDIR/required_deps # Find if any required dependencies are missing from the list of dependencies @@ -203,7 +204,7 @@ EOF # For every missing dependency, find the tests which directly depend on # it, and print that list for debugging. Not really clear if this is # helpful since the only examples I've seen are enormous. - bazel query "rdeps(kind(py_test, $(cat $BATS_TEST_TMPDIR/deps)), $dep, 1)" + bazel cquery "rdeps(kind(py_test, $(cat $BATS_TEST_TMPDIR/deps)), $dep, 1)" done < $BATS_TEST_TMPDIR/missing_deps exit 1 fi diff --git a/ci/official/utilities/docker.sh b/ci/official/utilities/docker.sh index ea1ecc267a4fe8..c50ea618cfea6c 100755 --- a/ci/official/utilities/docker.sh +++ b/ci/official/utilities/docker.sh @@ -18,7 +18,7 @@ if [[ "$TFCI_DOCKER_PULL_ENABLE" == 1 ]]; then fi if [[ "$TFCI_DOCKER_REBUILD_ENABLE" == 1 ]]; then - DOCKER_BUILDKIT=1 docker build --cache-from "$TFCI_DOCKER_IMAGE" -t "$TFCI_DOCKER_IMAGE" "${TFCI_DOCKER_REBUILD_ARGS[@]}" + DOCKER_BUILDKIT=1 docker build --cache-from "$TFCI_DOCKER_IMAGE" -t "$TFCI_DOCKER_IMAGE" $TFCI_DOCKER_REBUILD_ARGS if [[ "$TFCI_DOCKER_REBUILD_UPLOAD_ENABLE" == 1 ]]; then docker push "$TFCI_DOCKER_IMAGE" fi @@ -28,9 +28,12 @@ fi # The container is not cleaned up automatically! Remove it with: # docker rm tf if ! docker container inspect tf >/dev/null 2>&1 ; then - docker run "${TFCI_DOCKER_ARGS[@]}" --name tf -w "$TFCI_GIT_DIR" -itd --rm \ + # Pass all existing TFCI_ variables into the Docker container + env_file=$(mktemp) + env | grep ^TFCI_ > "$env_file" + docker run $TFCI_DOCKER_ARGS --name tf -w "$TFCI_GIT_DIR" -itd --rm \ -v "$TFCI_GIT_DIR:$TFCI_GIT_DIR" \ - --env TFCI_PYTHON_VERSION \ + --env-file "$env_file" \ "$TFCI_DOCKER_IMAGE" \ bash fi diff --git a/ci/official/utilities/extract_resultstore_links.py b/ci/official/utilities/extract_resultstore_links.py index a8013974f20e56..da04f5473c505b 100644 --- a/ci/official/utilities/extract_resultstore_links.py +++ b/ci/official/utilities/extract_resultstore_links.py @@ -248,11 +248,12 @@ def create_xml_file(result_store_dict: ResultDictType, f.write(b'\n') tree.write(f) if verbose: - print(f'Wrote to {file_path}') + print(f'\nWrote XML with Bazel invocation results to {file_path}') def print_invocation_results(result_store_dict: ResultDictType): """Prints out a short summary of the found ResultStore links (if any).""" + print() if not result_store_dict: print('Found no ResultStore links for Bazel build/test invocations.') else: diff --git a/ci/official/utilities/rename_and_verify_wheels.sh b/ci/official/utilities/rename_and_verify_wheels.sh index 4388329ae6edd7..5d02a96f7de7a9 100755 --- a/ci/official/utilities/rename_and_verify_wheels.sh +++ b/ci/official/utilities/rename_and_verify_wheels.sh @@ -15,28 +15,51 @@ # limitations under the License. # ============================================================================== # -# Check and rename wheels with auditwheel. Inserts the platform tags like -# "manylinux_xyz" into the wheel filename. +# Usage: rename_and_verify_wheels.sh +# This script is aware of TFCI_ variables, so it doesn't need any arguments. 
+# Puts new wheel through auditwheel to rename and verify it, deletes the old +# one, checks the filesize, and then ensures the new wheel is installable. set -euxo pipefail -DIR=$1 -find "$DIR" -iname "*.whl" | while read wheel; do - echo "Checking and renaming $wheel..." - wheel=$(realpath "$wheel") - # Repair wheel based upon name/architecture, fallback to x86 - if [[ $wheel == *"aarch64.whl" ]]; then - time python3 -m auditwheel repair --plat manylinux2014_aarch64 "$wheel" --wheel-dir "$DIR" 2>&1 | tee check.txt - else - time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir "$DIR" 2>&1 | tee check.txt - fi +cd "$TFCI_OUTPUT_DIR" - # We don't need the original wheel if it was renamed - new_wheel=$(awk '/Fixed-up wheel written to/ {print $NF}' check.txt) - if [[ "$new_wheel" != "$wheel" ]]; then - rm "$wheel" - wheel="$new_wheel" - fi - rm check.txt +if [[ "$(ls *.whl | wc -l | tr -d ' ')" != "1" ]]; then + echo "Error: $TFCI_OUTPUT_DIR should contain exactly one .whl file." + exit 1 +fi - TF_WHEEL="$wheel" BUILD_DIR="$DIR" bats ./ci/official/utilities/wheel_verification.bats --timing -done +# Repair wheels with auditwheel and delete the old one. +if [[ "$TFCI_WHL_AUDIT_ENABLE" == "1" ]]; then + python3 -m auditwheel repair --plat "$TFCI_WHL_AUDIT_PLAT" --wheel-dir . *.whl + # if the wheel is already named correctly, auditwheel won't rename it. so we + # list all .whl files by their modification time (ls -t) and delete anything + # other than the most recently-modified one (the new one). + ls -t *.whl | tail -n +2 | xargs rm +fi + +# Check if size is too big. TFCI_WHL_SIZE_LIMIT is in find's format, which can be +# 'k' for kilobytes, 'M' for megabytes, or 'G' for gigabytes, and the + to indicate +# "anything greater than" is added by the script. +if [[ "$TFCI_WHL_SIZE_LIMIT_ENABLE" == "1" ]] && [[ -n "$(find . -iname "*.whl" -size "+$TFCI_WHL_SIZE_LIMIT")" ]]; then + echo "Error: Generated wheel is too big! Limit is $TFCI_WHL_SIZE_LIMIT" + echo '(search for TFCI_WHL_SIZE_LIMIT to change it)' + ls -sh *.whl + exit 2 +fi + +# Quick install checks +venv=$(mktemp -d) +"python${TFCI_PYTHON_VERSION}" -m venv "$venv" +python="$venv/bin/python3" +"$python" -m pip install *.whl $TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS +"$python" -c 'import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2).shape)' +"$python" -c 'import sys; import tensorflow as tf; sys.exit(0 if "keras" in tf.keras.__name__ else 1)' +# VERY basic check to ensure the [and-cuda] package variant is installable. +# Checks TFCI_BAZEL_COMMON_ARGS for "gpu" or "cuda", implying that the test is +# relevant. All of the GPU test machines have CUDA installed via other means, +# so I am not sure how to verify that the dependencies themselves are valid for +# the moment. +if [[ "$TFCI_BAZEL_COMMON_ARGS" =~ gpu|cuda ]]; then + echo "Checking to make sure tensorflow[and-cuda] is installable..." + "$python" -m pip install "$(echo *.whl)[and-cuda]" $TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS +fi diff --git a/ci/official/utilities/setup.sh b/ci/official/utilities/setup.sh index aa4e7838ba0b8d..8c004aee8b8141 100755 --- a/ci/official/utilities/setup.sh +++ b/ci/official/utilities/setup.sh @@ -51,11 +51,18 @@ cd "$TFCI_GIT_DIR" # even works for arrays; e.g. TFCI_SOME_ARRAY="(--array --contents)" ends up # as TFCI_SOME_ARRAY=(--array --contents) in the storage file and is thus # loaded as an array when sourced. 
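A side note on the TFCI_WHL_SIZE_LIMIT convention used by rename_and_verify_wheels.sh above: the value is handed straight to find, so the check behaves like the sketch below (the 240M figure is just one of the limits set in the env files, not a new constant):

    # Prints the wheel only if it is larger than 240 megabytes; an empty
    # result means the wheel is within the limit, matching the script's test.
    find . -iname "*.whl" -size +240M
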
-if [[ -n "${TFCI:-}" ]]; then +if [[ -z "${TFCI:-}" ]]; then + echo '==TFCI==: The $TFCI variable is not set. This is fine as long as you' + echo 'already sourced a TFCI env file with "set -a; source ; set +a".' + echo 'If you have not, you will see a lot of undefined variable errors.' +else FROM_ENV=$(mktemp) # Piping into cat means grep won't abort the process if no errors are found. env | grep TFCI_ | cat > "$FROM_ENV" + # Source the default ci values + source ./ci/official/envs/ci_default + # Sourcing TFCI twice, the first time with "-u" unset, means that variable # order does not matter. i.e. "TFCI_BAR=$TFCI_FOO; TFCI_FOO=true" will work. # TFCI_FOO is only valid the second time through. @@ -73,10 +80,11 @@ if [[ -n "${TFCI:-}" ]]; then source "$FROM_ENV" rm "$FROM_ENV" fi -else - echo '==TFCI==: The $TFCI variable is not set. This is fine as long as you' - echo 'already sourced a TFCI env file with "set -a; source ; set +a".' - echo 'If you have not, you will see a lot of undefined variable errors.' +fi + +# Mac builds have some specific setup needs. See setup_macos.sh for details +if [[ "${OSTYPE}" =~ darwin* ]]; then + source ./ci/official/utilities/setup_macos.sh fi # Force-disable uploads if the job initiator is not Kokoro diff --git a/ci/official/utilities/setup_macos.sh b/ci/official/utilities/setup_macos.sh new file mode 100644 index 00000000000000..a6bd223402490f --- /dev/null +++ b/ci/official/utilities/setup_macos.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# macOS specific setup for all TF scripts. +# + +# Mac version of Core utilities differ in usage. Since our scripts are written +# with the GNU style, we need to set GNU utilities to be default on Mac. +if [[ -n "$(which grealpath)" ]] && [[ -n "$(which gstat)" ]]; then + alias realpath=grealpath + alias stat=gstat + # By default, aliases are only expanded in interactive shells, which means + # that they are not substituted for their corresponding commands in shell + # scripts. By setting "expand_aliases", we enable alias expansion in + # non-interactive shells as well. + shopt -s expand_aliases +else + echo '==TFCI==: Error: Cannot find path to grealpath or gstat' + echo 'TF CI scripts require GNU core utilties to be installed. Please make' + echo 'sure they are present on your system and try again.' + exit 1 +fi + +# "TFCI_MACOS_BAZEL_TEST_DIR_PATH" specifies the directory that Bazel should use +# when running tests. Each test will be executed in a separate subdirectory +# inside this directory. TF Mac builds need ~150 GB of disk space to be able to +# run all the tests. Since TFCI Mac VMs execute Bazel test commands in a +# partition with insufficient storage, we specify the +# 'TFCI_MACOS_BAZEL_TEST_DIR_PATH' environment variable to point to a partition +# with ample storage. 
When this variable is empty (i.e by default), Bazel will +# use the output base directory to run tests. +if [[ "${TFCI_MACOS_BAZEL_TEST_DIR_ENABLE}" == 1 ]]; then + mkdir -p "${TFCI_MACOS_BAZEL_TEST_DIR_PATH}" + export TEST_TMPDIR="${TFCI_MACOS_BAZEL_TEST_DIR_PATH}" +fi + +# "TFCI_MACOS_INSTALL_BAZELISK_ENABLE" is used to decide if we need to install +# Bazelisk manually. We enable this for macOS x86 builds as those VMs do not +# have Bazelisk pre-installed. "TFCI_MACOS_INSTALL_BAZELISK_URL" contains the +# link to the Bazelisk binary which needs to be downloaded. +if [[ "${TFCI_MACOS_INSTALL_BAZELISK_ENABLE}" == 1 ]]; then + sudo wget --no-verbose -O "/usr/local/bin/bazel" "${TFCI_MACOS_INSTALL_BAZELISK_URL}" + chmod +x "/usr/local/bin/bazel" +fi + +# "TFCI_MACOS_UPGRADE_PYENV_ENABLE" is used to decide if we need to upgrade the +# Pyenv version. We enable this for macOS x86 builds as the default Pyenv on +# those VMs does not support installing Python 3.10 and above which we need +# for running smoke tests in nightly/release wheel builds. +if [[ "${TFCI_MACOS_UPGRADE_PYENV_ENABLE}" == 1 ]]; then + brew upgrade pyenv +fi + +# "TFCI_MACOS_PYENV_INSTALL_ENABLE" controls whether to use Pyenv to install +# the Python version set in "TFCI_PYTHON_VERSION" and use it as default. +# We enable this in the nightly and release builds because before uploading the +# wheels, we install them in a virtual environment and run some smoke tests on +# it. TFCI Mac VMs only have one Python version installed so we need to install +# the other versions manually. +if [[ "${TFCI_MACOS_PYENV_INSTALL_ENABLE}" == 1 ]]; then + pyenv install "$TFCI_PYTHON_VERSION" + pyenv local "$TFCI_PYTHON_VERSION" + # Do a sanity check to make sure that we using the correct Python version + python --version +fi + +if [[ "$TFCI_PYTHON_VERSION" == "3.12" ]]; then + # dm-tree (Keras v3 dependency) doesn't have pre-built wheels for 3.12 yet. + # Having CMake allows building them. + # Once the wheels are added, this should be removed - b/308399490. + brew install cmake +fi + +# Scheduled nightly and release builds upload build artifacts (Pip packages, +# Libtensorflow archives) to GCS buckets. TFCI Mac VMs need to authenticate as +# a service account that has the right permissions to be able to do so. +set +x +if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]]; then + gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" +fi +set -x \ No newline at end of file diff --git a/ci/official/utilities/wheel_verification.bats b/ci/official/utilities/wheel_verification.bats deleted file mode 100644 index 99d0f32e35162e..00000000000000 --- a/ci/official/utilities/wheel_verification.bats +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -# Suite of verification tests for the SINGLE TensorFlow wheel in the -# $BUILD_DIR directory, or whatever path is set as $TF_WHEEL. - -setup_file() { - cd "$BUILD_DIR" - if [[ -z "$TF_WHEEL" ]]; then - export TF_WHEEL=$(find "$BUILD_DIR" -iname "*.whl") - fi - - # Setup the env for the python import testing - if [[ $TF_WHEEL == *"aarch64.whl" ]]; then - python${TFCI_PYTHON_VERSION} -m venv "$BATS_FILE_TMPDIR/venv" - else - python3 -m venv "$BATS_FILE_TMPDIR/venv" - fi -} - -teardown_file() { - rm -rf "$BATS_FILE_TMPDIR/venv" -} - -@test "Wheel is manylinux2014 (manylinux_2_17) compliant" { - python3 -m auditwheel show "$TF_WHEEL" > audit.txt - # Verify wheel based upon name/architecture, fallback to x86 - if [[ $TF_WHEEL == *"aarch64.whl" ]]; then - grep --quiet -zoP 'is consistent with the following platform tag:\n"manylinux_2_17_aarch64"\.' audit.txt - else - grep --quiet 'This constrains the platform tag to "manylinux_2_17_x86_64"' audit.txt - fi -} - -@test "Wheel conforms to upstream size limitations" { - WHEEL_MEGABYTES=$(stat --format %s "$TF_WHEEL" | awk '{print int($1/(1024*1024))}') - # Googlers: search for "test_tf_whl_size" - case "$TF_WHEEL" in - # CPU: - *cpu*manylinux*) LARGEST_OK_SIZE=240 ;; - # GPU: - *manylinux*) LARGEST_OK_SIZE=580 ;; - # Unknown: - *) - echo "The wheel's name is in an unknown format." - exit 1 - ;; - esac - # >&3 forces output in bats even if the test passes. See - # https://bats-core.readthedocs.io/en/stable/writing-tests.html#printing-to-the-terminal - echo "# Size of $TF_WHEEL is $WHEEL_MEGABYTES / $LARGEST_OK_SIZE megabytes." >&3 - test "$WHEEL_MEGABYTES" -le "$LARGEST_OK_SIZE" -} - -# Note: this runs before the tests further down the file, so TF is installed in -# the venv and the venv is active when those tests run. The venv gets cleaned -# up in teardown_file() above. -@test "Wheel is installable" { - source "$BATS_FILE_TMPDIR/venv/bin/activate" - python3 -m pip install "$TF_WHEEL" -} - -@test "TensorFlow is importable" { - source "$BATS_FILE_TMPDIR/venv/bin/activate" - python3 -c 'import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2).shape)' -} - -# Is this still useful? -@test "TensorFlow has Keras" { - source "$BATS_FILE_TMPDIR/venv/bin/activate" - python3 -c 'import sys; import tensorflow as tf; sys.exit(0 if "keras" in tf.keras.__name__ else 1)' -} - -# Is this still useful? -@test "TensorFlow has Estimator" { - source "$BATS_FILE_TMPDIR/venv/bin/activate" - python3 -c 'import sys; import tensorflow as tf; sys.exit(0 if "_v2.estimator" in tf.estimator.__name__ else 1)' -} diff --git a/ci/official/wheel.sh b/ci/official/wheel.sh index 20c6f2637d7e12..5789e58703a18b 100755 --- a/ci/official/wheel.sh +++ b/ci/official/wheel.sh @@ -25,30 +25,17 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then tfrun python3 tensorflow/tools/ci_build/update_version.py --nightly fi -# Download libtpu.so for tensorflow-tpu builds only. -if [[ "$TFCI_LIBTPU_DOWNLOAD_ENABLE" == 1 ]]; then - wget -P ./tensorflow/lib/ "$TFCI_LIBTPU_DOWNLOAD_URL" -fi -if [[ "$TFCI_LIBTPU_DOWNLOAD_NIGHTLY_ENABLE" == 1 ]]; then - # For nightly jobs, libtpu.so comes from the latest nightly libtpu build. 
- # Note: expects a working wheel for today - DATE=$(TZ='America/Los_Angeles' date '+%Y%m%d') - tfrun wget "https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/wheels/libtpu-nightly/libtpu_nightly-0.1.dev${DATE}-py3-none-any.whl" -O libtpu.whl - # -j to discard intermediate directories; -o to overwrite if exists; -d to set output dir - tfrun unzip libtpu.whl libtpu/libtpu.so -j -o -d ./tensorflow/lib -fi - -tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" build "${TFCI_BAZEL_COMMON_ARGS[@]}" //tensorflow/tools/pip_package:build_pip_package -tfrun ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$TFCI_OUTPUT_DIR" "${TFCI_BUILD_PIP_PACKAGE_ARGS[@]}" -tfrun ./ci/official/utilities/rename_and_verify_wheels.sh "$TFCI_OUTPUT_DIR" +tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS build $TFCI_BAZEL_COMMON_ARGS //tensorflow/tools/pip_package:build_pip_package +tfrun ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$TFCI_OUTPUT_DIR" $TFCI_BUILD_PIP_PACKAGE_ARGS +tfrun ./ci/official/utilities/rename_and_verify_wheels.sh if [[ "$TFCI_UPLOAD_WHL_PYPI_ENABLE" == 1 ]]; then - twine upload "${TFCI_UPLOAD_WHL_PYPI_ARGS[@]}" "$TFCI_OUTPUT_DIR"/*.whl + twine upload $TFCI_UPLOAD_WHL_PYPI_ARGS "$TFCI_OUTPUT_DIR"/*.whl fi if [[ "$TFCI_UPLOAD_WHL_GCS_ENABLE" == 1 ]]; then gsutil cp "$TFCI_OUTPUT_DIR"/*.whl "$TFCI_UPLOAD_WHL_GCS_URI" fi if [[ "$TFCI_WHL_BAZEL_TEST_ENABLE" == 1 ]]; then - tfrun bazel "${TFCI_BAZEL_BAZELRC_ARGS[@]}" test "${TFCI_BAZEL_COMMON_ARGS[@]}" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_wheel_test" + tfrun bazel $TFCI_BAZEL_BAZELRC_ARGS test $TFCI_BAZEL_COMMON_ARGS --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_wheel_test" fi diff --git a/tensorflow/BUILD b/tensorflow/BUILD index ef01b603800a71..289f37ef902c63 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -41,8 +41,11 @@ load( ) # copybara:uncomment_begin +# # buildifier: disable=out-of-order-load +# load("//devtools/build_cleaner/skylark:action_config_test.bzl", "action_config_test") # load("//devtools/copybara/rules:copybara.bzl", "copybara_config_test") # load("//tools/build_defs/license:license.bzl", "license") +# # buildifier: enable=out-of-order-load # copybara:uncomment_end # copybara:comment_begin(oss-only) @@ -183,6 +186,11 @@ package( # ], # deps = [":copybara_config"], # ) +# +# action_config_test( +# name = "build_cleaner_spec_test", +# src = "build_cleaner_spec.textproto", +# ) # copybara:uncomment_end licenses(["notice"]) @@ -1366,7 +1374,6 @@ tf_cc_shared_library( "//tensorflow/compiler/mlir/lite/quantization/lite:quantize_model", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/sparsity:sparsify_model", - "//tensorflow/compiler/mlir/python:mlir", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:custom_aggregator_op", "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model_cc_impl", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", @@ -1449,9 +1456,14 @@ tf_cc_shared_library( "//tensorflow/lite:util", "//tensorflow/python/grappler:cost_analyzer_lib", "//tensorflow/tools/graph_transforms:transform_graph_lib", - ] + (tf_monitoring_python_deps() + - tf_additional_plugin_deps() + - tf_additional_profiler_deps()) + if_xla_available([ + ] + select({ + "//tensorflow/compiler/mlir/python:disable_mlir_config": [], + "//conditions:default": [ + "//tensorflow/compiler/mlir/python:mlir", + ], + }) + (tf_monitoring_python_deps() + + tf_additional_plugin_deps() + + tf_additional_profiler_deps()) + 
if_xla_available([ "//tensorflow/compiler/aot:tfcompile_lib", ]) + if_static(extra_deps = [ "//tensorflow/core/platform:tensor_float_32_utils", diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index 321738084016a7..1ccf2fe07f0af9 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -30,7 +30,6 @@ import distutils as _distutils import importlib import inspect as _inspect -import logging as _logging import os as _os import site as _site import sys as _sys @@ -62,18 +61,6 @@ __path__.append(_tf_api_dir) # Hook external TensorFlow modules. -# Import compat before trying to import summary from tensorboard, so that -# reexport_tf_summary can get compat from sys.modules. Only needed if using -# lazy loading. -_current_module.compat.v2 # pylint: disable=pointless-statement -try: - from tensorboard.summary._tf import summary - _current_module.__path__ = ( - [_module_util.get_parent_dir(summary)] + _current_module.__path__) - setattr(_current_module, "summary", summary) -except ImportError: - _logging.warning( - "Limited tf.summary API due to missing TensorBoard installation.") # Load tensorflow-io-gcs-filesystem if enabled if (_os.getenv("TF_USE_MODULAR_FILESYSTEM", "0") == "true" or diff --git a/tensorflow/build_cleaner_spec.textproto b/tensorflow/build_cleaner_spec.textproto new file mode 100644 index 00000000000000..bea7e8ac36462a --- /dev/null +++ b/tensorflow/build_cleaner_spec.textproto @@ -0,0 +1,14 @@ +# proto-file: devtools/build_cleaner/proto/actions.proto +# proto-message: ActionSpecs + +# Python rules should not have more than one source file. +action_spec { + action: CHECK_FILE_COUNT + file_count_params { + rule_selector { + rule_kind_regex: "^.*py(type)?(_strict)?_(binary|library|test).*$" + generator_function_regex: "^(?!boq_header)$" + } + max_source_count: 1 + } +} \ No newline at end of file diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD index 5c7bbddc3af6f2..d6ad4fe8d5d244 100644 --- a/tensorflow/c/experimental/next_pluggable_device/BUILD +++ b/tensorflow/c/experimental/next_pluggable_device/BUILD @@ -16,7 +16,7 @@ cc_library( "//tensorflow/c:c_api_macros_hdrs", "//tensorflow/c:kernels_experimental_hdrs", "//tensorflow/c:kernels_hdrs", - "//tensorflow/c:tf_buffer_internal", + "//tensorflow/c:tf_buffer", "//tensorflow/c:tf_status_internal", "//tensorflow/c:tf_tensor_internal", "//tensorflow/compiler/jit:variable_info", diff --git a/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.cc.golden b/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.cc.golden index 490514f80e18a4..54a45cb23ed110 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.cc.golden +++ b/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.cc.golden @@ -45,7 +45,7 @@ Status Neg(AbstractContext* ctx, AbstractTensorHandle* const x, AbstractTensorHa // Summary: // // Description: -Status MatMul(AbstractContext* ctx, AbstractTensorHandle* const a, AbstractTensorHandle* const b, AbstractTensorHandle** product, bool transpose_a, bool transpose_b, const char* name, const char* raw_device_name) { +Status MatMul(AbstractContext* ctx, AbstractTensorHandle* const a, AbstractTensorHandle* const b, AbstractTensorHandle** product, bool transpose_a, bool transpose_b, bool grad_a, bool grad_b, const char* name, const char* raw_device_name) { AbstractOperationPtr op_ptr(ctx->CreateOperation()); TF_RETURN_IF_ERROR(op_ptr->Reset("MatMul", 
raw_device_name)); TF_RETURN_IF_ERROR(MaybeSetOpName(op_ptr.get(), name)); @@ -53,6 +53,8 @@ Status MatMul(AbstractContext* ctx, AbstractTensorHandle* const a, AbstractTenso TF_RETURN_IF_ERROR(op_ptr->AddInput(b)); TF_RETURN_IF_ERROR(op_ptr->SetAttrBool("transpose_a", transpose_a)); TF_RETURN_IF_ERROR(op_ptr->SetAttrBool("transpose_b", transpose_b)); + TF_RETURN_IF_ERROR(op_ptr->SetAttrBool("grad_a", grad_a)); + TF_RETURN_IF_ERROR(op_ptr->SetAttrBool("grad_b", grad_b)); int num_retvals = 1; return op_ptr->Execute(absl::MakeSpan(product, 1), &num_retvals); } diff --git a/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.h.golden b/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.h.golden index 4b24a4f55ecff1..1d1255a20d6aa9 100644 --- a/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.h.golden +++ b/tensorflow/c/experimental/ops/gen/cpp/golden/testing_ops.h.golden @@ -28,7 +28,7 @@ namespace ops { Status Neg(AbstractContext* ctx, AbstractTensorHandle* const x, AbstractTensorHandle** y, const char* name = nullptr, const char* raw_device_name = nullptr); // -Status MatMul(AbstractContext* ctx, AbstractTensorHandle* const a, AbstractTensorHandle* const b, AbstractTensorHandle** product, bool transpose_a = false, bool transpose_b = false, const char* name = nullptr, const char* raw_device_name = nullptr); +Status MatMul(AbstractContext* ctx, AbstractTensorHandle* const a, AbstractTensorHandle* const b, AbstractTensorHandle** product, bool transpose_a = false, bool transpose_b = false, bool grad_a = false, bool grad_b = false, const char* name = nullptr, const char* raw_device_name = nullptr); // Status IdentityN(AbstractContext* ctx, absl::Span input, absl::Span output, const char* name = nullptr, const char* raw_device_name = nullptr); diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index 60f6d6e0250e75..af37ab0cb19011 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -13,7 +13,6 @@ load( package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - # copybara:uncomment() "//learning/brain/tfrt/aot:__pkg__", "//tensorflow/c:__subpackages__", "//tensorflow/c/experimental/saved_model/internal:__pkg__", ], diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc index 3fcd255a2248ab..12391143a4d9e0 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -230,10 +230,6 @@ class CStreamExecutor : public internal::StreamExecutorInterface { DeviceMemoryBase Allocate(uint64 size) { return Allocate(size, /*memory_space=*/0); } - void* GetSubBuffer(DeviceMemoryBase* parent, uint64 offset, - uint64 size) override { - LOG(FATAL) << "GetSubBuffer is not supported by pluggable device."; - } void Deallocate(DeviceMemoryBase* mem) override { SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(mem); diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc index 7eda58471c2f57..0f3e2e76aa4ebe 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "xla/stream_executor/event.h" #include "xla/stream_executor/multi_platform_manager.h" #include "xla/stream_executor/stream.h" -#include "xla/stream_executor/stream_executor_pimpl.h" +#include "xla/stream_executor/stream_executor.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/error_codes.pb.h" diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc index 09ebb300969aef..1d22fa18cba53a 100644 --- a/tensorflow/c/kernels.cc +++ b/tensorflow/c/kernels.cc @@ -249,7 +249,8 @@ class CAsyncOpKernel : public AsyncOpKernel { n.WaitForNotification(); } - void ComputeAsync(OpKernelContext* ctx, AsyncOpKernelDoneCallback done) { + void ComputeAsync(OpKernelContext* ctx, + AsyncOpKernelDoneCallback done) override { (*compute_async_func_)( c_kernel_, reinterpret_cast(ctx), reinterpret_cast(&done)); diff --git a/tensorflow/c/kernels_experimental.cc b/tensorflow/c/kernels_experimental.cc index 7e6f818be47b39..09ce84d42f7392 100644 --- a/tensorflow/c/kernels_experimental.cc +++ b/tensorflow/c/kernels_experimental.cc @@ -292,7 +292,7 @@ struct TmpVar : public ResourceBase { tensorflow::mutex mu; Tensor val; std::string name; - std::string DebugString() const { return name; } + std::string DebugString() const override { return name; } ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; } }; @@ -626,7 +626,7 @@ static Status CCBinaryAddFunc( binary_add_func(ctx, a, b, out); return cc_ctx->status(); } -}; +} static Status VariantBinaryAddFunc( ::tensorflow::OpKernelContext* cc_ctx, const Variant& a, const Variant& b, diff --git a/tensorflow/cc/saved_model/bundle_v2.cc b/tensorflow/cc/saved_model/bundle_v2.cc index dcf0b5c5443187..d059c5d0c5729d 100644 --- a/tensorflow/cc/saved_model/bundle_v2.cc +++ b/tensorflow/cc/saved_model/bundle_v2.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/bundle_v2.h" +#include #include #include @@ -113,8 +114,8 @@ absl::Status SavedModelV2Bundle::Load(const std::string& export_dir, // Load the variables checkpoint reader. const std::string variables_prefix = io::JoinPath(variables_dir, kSavedModelVariablesFilename); - bundle->variable_reader_.reset( - new BundleReader(Env::Default(), variables_prefix)); + bundle->variable_reader_ = + std::make_unique(Env::Default(), variables_prefix); TF_RETURN_WITH_CONTEXT_IF_ERROR( bundle->variable_reader_->status(), "Unable to load SavedModel variables checkpoint from ", diff --git a/tensorflow/cc/saved_model/image_format/BUILD b/tensorflow/cc/saved_model/image_format/BUILD index 10a35871a708be..7fd743cf9c8356 100644 --- a/tensorflow/cc/saved_model/image_format/BUILD +++ b/tensorflow/cc/saved_model/image_format/BUILD @@ -32,7 +32,9 @@ cc_library( "//tensorflow/tools/proto_splitter/cc:max_size", "@com_google_absl//absl/log", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:cord", ] + if_not_windows_or_mac([ "//tensorflow/tools/proto_splitter:merge", "//tensorflow/tools/proto_splitter/cc:saved_model_splitter", diff --git a/tensorflow/cc/saved_model/image_format/internal_api.cc b/tensorflow/cc/saved_model/image_format/internal_api.cc index b959602ba445c9..db38d1786e59ea 100644 --- a/tensorflow/cc/saved_model/image_format/internal_api.cc +++ b/tensorflow/cc/saved_model/image_format/internal_api.cc @@ -16,9 +16,11 @@ limitations under the License. 
#include "tensorflow/cc/saved_model/image_format/internal_api.h" #include +#include #include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/strings/cord.h" #include "absl/strings/str_cat.h" #include "tensorflow/cc/saved_model/metrics.h" #include "tensorflow/cc/saved_model/util.h" @@ -31,7 +33,7 @@ limitations under the License. #include "tensorflow/tools/proto_splitter/cc/saved_model_splitter.h" #include "tensorflow/tools/proto_splitter/merge.h" #endif - +#define IS_OSS false namespace tensorflow { namespace image_format { @@ -104,6 +106,27 @@ absl::Status WriteSavedModel(SavedModel* saved_model_proto, #endif } +absl::StatusOr> WriteSavedModelToString( + SavedModel* saved_model_proto) { +#if !defined(PLATFORM_WINDOWS) && !defined(__APPLE__) + tools::proto_splitter::SavedModelSplitter splitter(saved_model_proto); + return splitter.WriteToString(); +#else + return absl::UnimplementedError( + "WriteSavedModelToString not implemented for Windows or MacOS."); +#endif +} + +#if !IS_OSS +// TODO(b/311769337): Define the function unconditionally after tf oss +// dependency is updated to protobuf v22.x. +absl::StatusOr> WriteSavedModelToCord( + SavedModel* saved_model_proto) { + tools::proto_splitter::SavedModelSplitter splitter(saved_model_proto); + return splitter.WriteToCord(); +} +#endif + absl::Status WriteSavedModel(SavedModel* saved_model_proto, const std::string& file_prefix, int debug_max_size) { diff --git a/tensorflow/cc/saved_model/image_format/internal_api.h b/tensorflow/cc/saved_model/image_format/internal_api.h index 465b00a74bfada..5c9b13d0f97364 100644 --- a/tensorflow/cc/saved_model/image_format/internal_api.h +++ b/tensorflow/cc/saved_model/image_format/internal_api.h @@ -17,10 +17,15 @@ limitations under the License. #define TENSORFLOW_CC_SAVED_MODEL_IMAGE_FORMAT_INTERNAL_API_H_ #include +#include #include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/cord.h" #include "tensorflow/core/protobuf/saved_model.pb.h" +#define IS_OSS false + namespace tensorflow { namespace image_format { @@ -29,13 +34,24 @@ namespace image_format { absl::Status ReadSavedModel(const std::string& file_prefix, SavedModel* saved_model_proto); -// Writes the SavedModel proto to {file_prefix}{.pb|.cpb}. -// If the proto is < the protobuf maximum size, then it will be serialized -// as a `.pb` proto binary. When larger than the maximum size, the SavedModel -// proto is destructively separated into chunks and written to +// Writes the SavedModel proto to a file or to string. If the proto is < the +// protobuf maximum size, then it will be serialized as a `.pb` proto binary. +// When larger than the maximum size, the SavedModel proto is destructively +// separated into chunks and written to // `.cpb` (chunked proto). +// +// Write SavedModel to {file_prefix}{.pb|.cpb}. absl::Status WriteSavedModel(SavedModel* saved_model_proto, const std::string& file_prefix); +// Writes the SavedModel proto to std::string +// The bool field record whether it's saved as a chunked protobuf (true) or +// regular protobuf (false) +absl::StatusOr> WriteSavedModelToString( + SavedModel* saved_model_proto); +#if !IS_OSS +absl::StatusOr> WriteSavedModelToCord( + SavedModel* saved_model_proto); +#endif // See above. The `debug_max_size` argument can be used to the maximum size to // less than 2GB for testing purposes. 
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 399d4cf37fef4c..a245bf59a1f187 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/loader.h" +#include #include #include @@ -267,7 +268,7 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, } // namespace -SavedModelBundleInterface::~SavedModelBundleInterface() {} +SavedModelBundleInterface::~SavedModelBundleInterface() = default; Status LoadMetagraphIntoSession(const SessionOptions& session_options, const MetaGraphDef& meta_graph, @@ -491,7 +492,7 @@ Status LoadSavedModel(const SessionOptions& session_options, TF_RETURN_IF_ERROR(LoadSavedModel(rewritten_options, run_options, export_dir, tags, &legacy_bundle)); *bundle = SavedModelBundleLite( - absl::make_unique(std::move(legacy_bundle.session)), + std::make_unique(std::move(legacy_bundle.session)), std::move(*legacy_bundle.meta_graph_def.mutable_signature_def())); return OkStatus(); } diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h index f2d318a25b7274..1dcd951d92b5ed 100644 --- a/tensorflow/cc/saved_model/loader.h +++ b/tensorflow/cc/saved_model/loader.h @@ -121,7 +121,7 @@ Status LoadMetagraphIntoSession(const SessionOptions& session_options, Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, - SavedModelBundle* const bundle); + SavedModelBundle* bundle); /// Loads a SavedModel from the specified export directory. The MetaGraphDef /// to be loaded is identified by the supplied tags, corresponding exactly to @@ -133,7 +133,7 @@ Status LoadSavedModel(const SessionOptions& session_options, Status LoadSavedModel(const SessionOptions& session_options, const RunOptions& run_options, const string& export_dir, const std::unordered_set& tags, - SavedModelBundleLite* const bundle); + SavedModelBundleLite* bundle); /// Checks whether the provided directory could contain a SavedModel. Note that /// the method does not load any data by itself. If the method returns `false`, diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py index 9d2f954293eddc..701623c328081e 100644 --- a/tensorflow/compat_template.__init__.py +++ b/tensorflow/compat_template.__init__.py @@ -16,7 +16,6 @@ # pylint: disable=g-bad-import-order,g-import-not-at-top,protected-access -import logging as _logging import os as _os import sys as _sys import typing as _typing @@ -31,15 +30,6 @@ # Hook external TensorFlow modules. _current_module = _sys.modules[__name__] -try: - from tensorboard.summary._tf import summary - _current_module.__path__ = ( - [_module_util.get_parent_dir(summary)] + _current_module.__path__) - setattr(_current_module, "summary", summary) -except ImportError: - _logging.warning( - "Limited tf.compat.v2.summary API due to missing TensorBoard " - "installation.") # Lazy-load estimator. 
_estimator_module = "tensorflow_estimator.python.estimator.api._v2.estimator" diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 6c276dbedef1f2..92d62b34be8bf9 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -1,8 +1,8 @@ load("//tensorflow:strict.default.bzl", "py_strict_binary") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "filegroup", "genrule") load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") -load("//tensorflow:tensorflow.bzl", "tf_cc_test") -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -131,7 +131,6 @@ genrule( tfcompile_test_dep_configs = [ ("", "None"), ("_mlir_bridge", "Bridge"), - ("_mhlo_lowering", "HloLowering"), ] [ @@ -473,42 +472,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "tfcompile_test_mhlo_lowering", - srcs = ["tfcompile_test.cc"], - extra_copts = ["-DMHLO_LOWERING_TEST"], - tags = [ - "manual", - "no_mac", # TODO(b/228273415) - ], - deps = [ - ":test_graph_tfadd_mhlo_lowering", - ":test_graph_tfadd_with_ckpt_mhlo_lowering", - ":test_graph_tfadd_with_ckpt_saver_mhlo_lowering", - ":test_graph_tfassert_eq_mhlo_lowering", - ":test_graph_tfcond_mhlo_lowering", - ":test_graph_tffunction_mhlo_lowering", - ":test_graph_tfgather_mhlo_lowering", - ":test_graph_tfmatmul_mhlo_lowering", - ":test_graph_tfmatmulandadd_mhlo_lowering", - ":test_graph_tfsplits_mhlo_lowering", - ":test_graph_tftop_k_mhlo_lowering", - ":test_graph_tfvariable_mhlo_lowering", - ":test_graph_tfvariable_readonly_mhlo_lowering", - ":test_graph_tfvariable_sequential_updates_mhlo_lowering", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core/platform:regexp", - "@com_google_absl//absl/strings", - "@eigen_archive//:eigen3", - "@local_xla//xla:shape_util", - "@local_xla//xla:test", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/service:hlo_profile_printer", - ], -) - tf_cc_test( name = "tfcompile_test_mlir_bridge", srcs = ["tfcompile_test.cc"], diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index f056533d1b21e6..a543aae5b92997 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -329,16 +329,7 @@ def _tf_library( "@local_xla//xla/service/cpu:runtime_single_threaded_conv2d", "@local_xla//xla/service/cpu:runtime_single_threaded_matmul", "@eigen_archive//:eigen3", - ] or []) + ( - mlir_components.count("HloLowering") > 0 and [ - "@local_xla//xla/runtime:aot_ffi_c_symbols", - "@local_xla//xla/service/cpu:runtime_mlir_utils", - ] or [] - ) + ( - include_standard_runtime_deps and mlir_components == "HloLowering" and [ - "@local_xla//xla/service/cpu/runtime:retain", - ] or [] - ) + (deps or []), + ] or []) + (deps or []), tags = tags, copts = copts, ) @@ -559,31 +550,6 @@ def tf_library( copts, xla_flags, ) - if mlir_components == "None": - _tf_library( - name + "_mlir", - graph, - config, - debug_info, - freeze_checkpoint, - freeze_saver, - cpp_class, - gen_test, - gen_benchmark, - gen_compiler_log, - visibility, - testonly, - tfcompile_flags, - tfcompile_tool, - include_standard_runtime_deps, - enable_xla_hlo_profiling, - enable_tracemes, - "HloLowering", - deps, - tags + ["notap", "local", "manual"], - copts, - xla_flags, - ) def 
target_llvm_triple(): """Returns the target LLVM triple to be used for compiling the target.""" diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index 276ab1786a8260..82ed25767b90de 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -283,6 +283,7 @@ void AllocateAndParseFlags() { bool enable_mlir_merge_control_flow_pass = true; bool enable_mlir_convert_control_to_data_outputs_pass = false; bool enable_mlir_strict_clusters = false; + bool enable_mlir_multiple_local_cpu_devices = false; // Dump graphs in TFG dialect. bool use_tfg_graph_dumper = false; bool enable_mlir_generic_outside_compilation = false; @@ -377,6 +378,11 @@ void AllocateAndParseFlags() { "MLIR-Based TensorFlow Compiler Bridge."), Flag("tf_mlir_enable_strict_clusters", &enable_mlir_strict_clusters, "Do not allow clusters that have cyclic control dependencies."), + Flag("tf_mlir_enable_multiple_local_cpu_devices", + &enable_mlir_multiple_local_cpu_devices, + "Enable multiple local CPU devices. CPU ops which are outside " + "compiled inside the tpu cluster will also be replicated across " + "multiple cpu devices."), Flag("tf_dump_graphs_in_tfg", &use_tfg_graph_dumper, "When tf_dump_graphs_in_tfg is true, graphs after transformations " "are dumped in MLIR TFG dialect and not in GraphDef"), @@ -413,6 +419,8 @@ void AllocateAndParseFlags() { enable_mlir_generic_outside_compilation; mlir_flags->tf_mlir_enable_tpu_variable_runtime_reformatting_pass = enable_tpu_variable_runtime_reformatting_pass; + mlir_flags->tf_mlir_enable_multiple_local_cpu_devices = + enable_mlir_multiple_local_cpu_devices; if (use_tfg_graph_dumper) { UseMlirForGraphDump(MlirDumpConfig{}.elide_large_attributes().emit_dialect( diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 04a15136d43072..45a4c83a614afd 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -291,6 +291,9 @@ struct MlirCommonFlags { bool tf_mlir_enable_strict_clusters; bool tf_mlir_enable_generic_outside_compilation; bool tf_mlir_enable_tpu_variable_runtime_reformatting_pass; + // TODO(pineapplejuice233): Revisit this flag once the performance impact is verified + // with different local CPU devices settings. + bool tf_mlir_enable_multiple_local_cpu_devices; }; // Flags for the JitRt pipeline -- see tf_jitrt_pipeline.h for details. 
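For context on the flag added above, here is a hedged sketch of how tf_mlir_enable_multiple_local_cpu_devices would typically be queried from C++ once flags have been parsed. It assumes the existing GetMlirCommonFlags() accessor declared in tensorflow/compiler/jit/flags.h and the usual TF_XLA_FLAGS environment-variable parsing path used by the neighboring tf_mlir_* flags; neither is introduced by this patch.

#include "tensorflow/compiler/jit/flags.h"

// Sketch: read the flag after flag parsing has run. In practice the value is
// usually toggled from the environment, e.g.
//   TF_XLA_FLAGS=--tf_mlir_enable_multiple_local_cpu_devices=true
// (assumption: the flag is registered in the same flag list as the other
// tf_mlir_enable_* flags shown in the hunk above).
bool MultipleLocalCpuDevicesEnabled() {
  return tensorflow::GetMlirCommonFlags()
      ->tf_mlir_enable_multiple_local_cpu_devices;
}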
diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index f3272f81fe6182..02c9f486e8e000 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -61,6 +61,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", "//tensorflow/compiler/mlir/tf2xla/internal/passes:clustering_passes", + "//tensorflow/compiler/mlir/tf2xla/internal/passes:mlir_to_graph_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tosa:tf_passes", diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 8117705b0fac2b..b6d406f040a296 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -319,9 +319,24 @@ cc_library( ], ) +gentbl_cc_library( + name = "tensorflow_lite_canonicalize_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = [ + ( + ["-gen-rewriters"], + "ir/tfl_canonicalize.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/tfl_canonicalize.td", + deps = [":tensorflow_lite_patterns_td_files"], +) + cc_library( name = "tensorflow_lite", srcs = [ + "ir/tfl_canonicalize.inc", "ir/tfl_ops.cc", "ir/tfl_ops.cc.inc", "ir/tfl_ops.h.inc", @@ -343,8 +358,10 @@ cc_library( "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h", ], deps = [ + ":converter_inc", ":cost_estimators", ":size_utils", + ":tensorflow_lite_canonicalize_inc_gen", ":tensorflow_lite_op_enums_inc_gen", ":tensorflow_lite_op_interfaces_inc_gen", ":tensorflow_lite_ops_inc_gen", @@ -360,6 +377,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:framework", "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@eigen_archive//:eigen3", @@ -1314,18 +1332,18 @@ cc_library( ":common", ":fake_quant_utils", ":tensorflow_lite_d2s", - ":tensorflow_lite_legalize_tf", - ":tensorflow_lite_optimize", - ":tensorflow_lite_optimize_batch_matmul", - ":tensorflow_lite_quantize", + ":tensorflow_lite_legalize_tf", # buildcleaner: keep + ":tensorflow_lite_optimize", # buildcleaner: keep + ":tensorflow_lite_optimize_batch_matmul", # buildcleaner: keep + ":tensorflow_lite_quantize", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_quantization_passes", "//tensorflow/compiler/mlir/lite/stablehlo:compose_uniform_quantized_type_pass", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", "//tensorflow/compiler/mlir/lite/stablehlo:rename_entrypoint_to_main", - "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo", - "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo", + "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo", # buildcleaner: keep + "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/stablehlo:transforms", "//tensorflow/compiler/mlir/lite/stablehlo:uniform_quantized_stablehlo_to_tfl_pass", "//tensorflow/compiler/mlir/tensorflow", diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc 
b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 4f044e153c68bb..81f69bfa87e940 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -165,12 +165,15 @@ constexpr size_t kInitialBufferSize = 10240; // `isSigned` is set to false for other types. static StatusOr GetTFLiteType(Type type, bool is_signed = true) { - if (!is_signed && type.isSignlessInteger(8)) { - return tflite::TensorType_UINT8; - } if (!is_signed) { - return Status(absl::StatusCode::kInvalidArgument, - "'isSigned' can only be set for 8-bits integer type"); + if (type.isSignlessInteger(8)) { + return tflite::TensorType_UINT8; + } else if (type.isSignlessInteger(16)) { + return tflite::TensorType_UINT16; + } else { + return Status(absl::StatusCode::kInvalidArgument, + "'isSigned' can only be set for 8/16-bits integer type"); + } } if (type.isF32()) { @@ -535,14 +538,16 @@ class Translator { const std::unordered_set& tags, OpOrArgNameMapper* op_or_arg_name_mapper, const std::map& metadata, - bool serialize_stablehlo_ops); + bool serialize_stablehlo_ops, + std::optional custom_option_alignment); private: enum class OpType : char { kTfliteBuiltin, kSelectTf, kCustomOp }; explicit Translator(ModuleOp module, const toco::TocoFlags& toco_flags, const std::unordered_set& saved_model_tags, OpOrArgNameMapper* op_or_arg_name_mapper, - const std::map& metadata) + const std::map& metadata, + std::optional custom_option_alignment) : module_(module), name_mapper_(*op_or_arg_name_mapper), builder_(kInitialBufferSize), @@ -553,7 +558,8 @@ class Translator { metadata_(metadata), supported_backends_(toco_flags.supported_backends().begin(), toco_flags.supported_backends().end()), - use_buffer_offset_(toco_flags.use_buffer_offset()) { + use_buffer_offset_(toco_flags.use_buffer_offset()), + custom_option_alignment_(custom_option_alignment) { // The first buffer must be empty according to the schema definition. empty_buffer_ = tflite::CreateBuffer(builder_); buffers_.push_back(empty_buffer_); @@ -582,9 +588,10 @@ class Translator { // Returns TFLite buffer populated with constant value if the operation is // TFLite constant operation. Otherwise, returns an empty buffer. Emits error - // and returns std::nullopt on failure. - std::optional> BuildBuffer(Value value, - int index); + // and returns std::nullopt on failure. The buffer index may be changed if + // duplicated buffer is found. + std::optional> BuildBuffer( + Value value, bool can_be_deduplicated, int& index); // Build TFLite tensor from the given type. This function is for tfl.lstm // intermediates, which should have UniformQuantizedType. @@ -675,11 +682,6 @@ class Translator { std::optional>> CreateMetadataVector(); - // Encodes the `tfl.metadata_buffer` array attribute of the module to the - // metadata_buffer section in the final model. Returns empty if there isn't - // such attribute in the mlir module. - VectorBufferOffset CreateMetadataBufferVector(); - // Builds and returns list of tfl.SignatureDef sections in the model. 
std::optional>> CreateSignatureDefs(const std::vector& signature_defs); @@ -751,6 +753,10 @@ class Translator { const std::vector& operands, const std::vector& results); + std::optional> BuildStablehloPadOp( + mlir::stablehlo::PadOp pad_op, const std::vector& operands, + const std::vector& results); + // create a subgraph given a unnamed mlir region, return the corresponding // subgraph index int32_t UnnamedRegionToSubgraph(mlir::Region* region, @@ -837,6 +843,12 @@ class Translator { bool use_buffer_offset_ = false; bool require_use_buffer_offset_ = false; + + std::optional custom_option_alignment_ = std::nullopt; + + // Map from mlir constant attribute to the buffer index. This is used to + // deduplicate the buffers in the flatbuffer. + llvm::DenseMap const_attribute_to_buffer_map_; }; bool Translator::EstimateArithmeticCount(int64_t* count) { @@ -860,7 +872,7 @@ std::string Translator::UniqueName(mlir::Value val) { } std::optional> Translator::BuildBuffer( - mlir::Value value, int index) { + mlir::Value value, bool can_be_deduplicated, int& index) { auto inst = value.getDefiningOp(); ElementsAttr attr; if (auto cst = dyn_cast(inst)) { @@ -883,6 +895,15 @@ std::optional> Translator::BuildBuffer( return empty_buffer_; } + if (can_be_deduplicated) { + if (const_attribute_to_buffer_map_.find(attr) != + const_attribute_to_buffer_map_.end()) { + index = const_attribute_to_buffer_map_[attr]; + return empty_buffer_; + } + const_attribute_to_buffer_map_[attr] = index; + } + // TF doesn't currently support 4-bit types (DT_INT4), so we'll run into // trouble calling ConvertToTensor(). For now, extract the tensor data from // ElementsAttr directly in this and read type from tflite::TensorType instead @@ -1168,6 +1189,13 @@ std::optional> Translator::BuildTensor( break; } } + // The value is used as a variable if produced by an op with "tfl.is_variable" + // attribute. This provides a hook for the user to represent the variable + // tensor in the MLIR level. 
+ if (auto* inst = value.getDefiningOp(); + inst && inst->hasAttr("tfl.is_variable")) { + is_variable = true; + } bool has_rank = type.hasRank(); @@ -1296,11 +1324,16 @@ BufferOffset Translator::BuildCustomOperator( /*builtin_options=*/0, /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS); } + if (custom_option_alignment_.has_value()) { + builder_.ForceVectorAlignment(custom_option_vector.size(), sizeof(uint8_t), + custom_option_alignment_.value()); + } + auto custom_option_fbs_vector = + builder_.CreateVector(custom_option_vector); return tflite::CreateOperator( builder_, opcode_index, builder_.CreateVector(operands), builder_.CreateVector(results), tflite::BuiltinOptions_NONE, - /*builtin_options=*/0, - builder_.CreateVector(custom_option_vector), + /*builtin_options=*/0, custom_option_fbs_vector, tflite::CustomOptionsFormat_FLEXBUFFERS); } @@ -1603,6 +1636,30 @@ Translator::BuildStablehloRngBitGeneratorOp( rng_options.Union()); } +std::optional> Translator::BuildStablehloPadOp( + mlir::stablehlo::PadOp pad_op, const std::vector& operands, + const std::vector& results) { + std::string op_name = pad_op->getName().getStringRef().str(); + uint32_t opcode_index = + GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_PAD); + + auto edge_padding_low = + builder_.CreateVector(pad_op.getEdgePaddingLow().vec()); + auto edge_padding_high = + builder_.CreateVector(pad_op.getEdgePaddingHigh().vec()); + auto interior_padding = + builder_.CreateVector(pad_op.getInteriorPadding().vec()); + + auto pad_option = tflite::CreateStablehloPadOptions( + builder_, edge_padding_low, edge_padding_high, interior_padding); + + return tflite::CreateOperator( + builder_, opcode_index, builder_.CreateVector(operands), + builder_.CreateVector(results), tflite::BuiltinOptions_NONE, 0, 0, + tflite::CustomOptionsFormat_FLEXBUFFERS, 0, 0, 0, 0, + tflite::BuiltinOptions2_StablehloPadOptions, pad_option.Union()); +} + std::optional> Translator::BuildOperator( Operation* inst, std::vector operands, const std::vector& results, @@ -1704,6 +1761,9 @@ std::optional> Translator::BuildOperator( return BuildStablehloOperatorwithoutOptions( inst, operands, results, tflite::BuiltinOperator_STABLEHLO_MINIMUM); } + if (auto shlo_op = llvm::dyn_cast(inst)) { + return BuildStablehloPadOp(shlo_op, operands, results); + } // for ops don't have kernels, only serialize when conversion is set to true if (convert_stablehlo_) { if (auto shlo_op = llvm::dyn_cast(inst)) { @@ -1817,8 +1877,7 @@ std::optional> Translator::BuildOperator( uint32_t opcode_index = GetOpcodeIndex( op_name, tflite::BuiltinOperator_STABLEHLO_DYNAMIC_SLICE); - auto slice_sizes = builder_.CreateVector( - mlir::GetOptionalVector(shlo_op.getSliceSizes())); + auto slice_sizes = builder_.CreateVector(shlo_op.getSliceSizes().vec()); auto dynamic_slice_option = tflite::CreateStablehloDynamicSliceOptions(builder_, slice_sizes); @@ -1854,27 +1913,6 @@ std::optional> Translator::BuildOperator( tflite::BuiltinOptions2_StablehloCompareOptions, compare_option.Union()); } - if (auto shlo_op = llvm::dyn_cast(inst)) { - std::string op_name = inst->getName().getStringRef().str(); - uint32_t opcode_index = - GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_PAD); - - auto edge_padding_low = builder_.CreateVector( - mlir::GetOptionalVector(shlo_op.getEdgePaddingLowAttr())); - auto edge_padding_high = builder_.CreateVector( - mlir::GetOptionalVector(shlo_op.getEdgePaddingHighAttr())); - auto interior_padding = builder_.CreateVector( - 
mlir::GetOptionalVector(shlo_op.getInteriorPaddingAttr())); - - auto pad_option = tflite::CreateStablehloPadOptions( - builder_, edge_padding_low, edge_padding_high, interior_padding); - - return tflite::CreateOperator( - builder_, opcode_index, builder_.CreateVector(operands), - builder_.CreateVector(results), tflite::BuiltinOptions_NONE, 0, 0, - tflite::CustomOptionsFormat_FLEXBUFFERS, 0, 0, 0, 0, - tflite::BuiltinOptions2_StablehloPadOptions, pad_option.Union()); - } if (auto shlo_op = llvm::dyn_cast(inst)) { std::string op_name = inst->getName().getStringRef().str(); uint32_t opcode_index = GetOpcodeIndex( @@ -1895,12 +1933,11 @@ std::optional> Translator::BuildOperator( uint32_t opcode_index = GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_SLICE); - auto start_indices = builder_.CreateVector( - mlir::GetOptionalVector(shlo_op.getStartIndicesAttr())); - auto limit_indices = builder_.CreateVector( - mlir::GetOptionalVector(shlo_op.getLimitIndicesAttr())); - auto strides = builder_.CreateVector( - mlir::GetOptionalVector(shlo_op.getStridesAttr())); + auto start_indices = + builder_.CreateVector(shlo_op.getStartIndices().vec()); + auto limit_indices = + builder_.CreateVector(shlo_op.getLimitIndices().vec()); + auto strides = builder_.CreateVector(shlo_op.getStrides().vec()); auto slice_option = tflite::CreateStablehloSliceOptions( builder_, start_indices, limit_indices, strides); @@ -2172,8 +2209,7 @@ std::optional> Translator::BuildOperator( op_name, tflite::BuiltinOperator_STABLEHLO_TRANSPOSE); auto transpose_option = tflite::CreateStablehloTransposeOptions( - builder_, builder_.CreateVector(mlir::GetOptionalVector( - shlo_op.getPermutation()))); + builder_, builder_.CreateVector(shlo_op.getPermutation().vec())); return tflite::CreateOperator( builder_, opcode_index, builder_.CreateVector(operands), @@ -2394,22 +2430,31 @@ std::optional> Translator::BuildSubGraph( quant_parameters = GetQuantizationForQuantStatsOpOutput(stats_op); } } - auto tensor_or = - BuildTensor(value, tensor_name, buffers_.size(), quant_parameters); - if (!tensor_or) return false; - tensors.push_back(*tensor_or); + int buffer_index = buffers_.size(); + // If a constant is returned as subgraph's output, this constant cannot be + // deduplicated. + const bool not_returned_by_subgraph = llvm::none_of( + value.getUsers(), + [](Operation* user) { return llvm::isa(user); }); // TODO(ashwinm): Check if for stateful tensors, if it is also needed to // make the Buffer empty apart from setting the buffer_idx=0 in the // Tensor. This does not seem to affect runtime behavior for RNN/LSTM, // but would be good for reducing memory footprint. 
if (value.getDefiningOp()) { - auto buffer_or = BuildBuffer(value, buffers_.size()); + auto buffer_or = + BuildBuffer(value, not_returned_by_subgraph, buffer_index); if (!buffer_or) return false; buffers_.push_back(*buffer_or); } else { buffers_.push_back(empty_buffer_); } + + auto tensor_or = + BuildTensor(value, tensor_name, buffer_index, quant_parameters); + if (!tensor_or) return false; + tensors.push_back(*tensor_or); + return true; }; @@ -2625,18 +2670,6 @@ Translator::CreateMetadataVector() { return builder_.CreateVector(metadata); } -VectorBufferOffset Translator::CreateMetadataBufferVector() { - auto array_attr = - module_->getAttrOfType("tfl.metadata_buffer"); - std::vector metadata_buffer; - if (!array_attr) return 0; - for (auto value : array_attr.getAsValueRange()) { - metadata_buffer.push_back(value.getSExtValue()); - } - - return builder_.CreateVector(metadata_buffer); -} - // Helper method that returns list of all strings in a StringAttr identified // by 'attr_key' and values are separated by a comma. llvm::SmallVector GetStringsFromAttrWithSeparator( @@ -2824,21 +2857,23 @@ std::optional Translator::Translate( const std::unordered_set& tags, OpOrArgNameMapper* op_or_arg_name_mapper, const std::map& metadata, - bool serialize_stablehlo_ops) { + bool serialize_stablehlo_ops, + std::optional custom_option_alignment) { OpOrArgLocNameMapper default_op_or_arg_name_mapper; if (!op_or_arg_name_mapper) op_or_arg_name_mapper = &default_op_or_arg_name_mapper; if (!UpdateEntryFunction(module)) return std::nullopt; if (!IsValidTFLiteMlirModule(module)) return std::nullopt; Translator translator(module, toco_flags, tags, op_or_arg_name_mapper, - metadata); + metadata, custom_option_alignment); translator.convert_stablehlo_ = serialize_stablehlo_ops; auto ret = translator.TranslateInternal(); if (translator.require_use_buffer_offset_) { auto new_toco_flags = toco_flags; new_toco_flags.set_use_buffer_offset(true); Translator new_translator(module, new_toco_flags, tags, - op_or_arg_name_mapper, metadata); + op_or_arg_name_mapper, metadata, + custom_option_alignment); return new_translator.TranslateInternal(); } return ret; @@ -3039,8 +3074,7 @@ std::optional Translator::TranslateInternal() { // Build the model and finish the model building process. 
auto description = builder_.CreateString(model_description.data()); - VectorBufferOffset metadata_buffer = - CreateMetadataBufferVector(); // Deprecated + VectorBufferOffset metadata_buffer = 0; // Deprecated auto metadata = CreateMetadataVector(); if (!metadata) return std::nullopt; @@ -3131,6 +3165,10 @@ void Translator::AppendBufferData(std::string& result) { for (auto& it : custom_op_data_map_) { while (result.size() % 16 != 0) result += '\0'; + if (custom_option_alignment_.has_value()) { + while (result.size() % custom_option_alignment_.value() != 0) + result += '\0'; + } auto buffer = std::string(it.second.begin(), it.second.end()); int64_t offset = result.size(); int64_t size = it.second.size(); @@ -3345,7 +3383,8 @@ bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module, bool serialize_stablehlo_ops) { auto maybe_translated = Translator::Translate( module, options.toco_flags, options.saved_model_tags, - options.op_or_arg_name_mapper, options.metadata, serialize_stablehlo_ops); + options.op_or_arg_name_mapper, options.metadata, serialize_stablehlo_ops, + options.custom_option_alignment); if (!maybe_translated) return false; *serialized_flatbuffer = std::move(*maybe_translated); return true; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.h b/tensorflow/compiler/mlir/lite/flatbuffer_export.h index b279c113c94a2a..cd461c96115375 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.h @@ -16,7 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ +#include #include +#include #include #include @@ -42,6 +44,10 @@ struct FlatbufferExportOptions { // OpOrArgNameMapper to convert location of the op to name in flatbuffer. // If not set, a default mapper will be used. tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper = nullptr; + // User-specified value of flatbuffer alignment requirement for custom + // options. If specified, the value should be multiplier of 16 (default + // alignment for TFL flatbuffer). + std::optional custom_option_alignment = std::nullopt; }; // Translates the given MLIR `module` into a FlatBuffer and stores the diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 6eb2aee99aacb9..69dd8ad342cfe1 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -604,10 +604,10 @@ static mlir::ElementsAttr GetSplat(RankedTensorType type, int unique_index, } // TODO(b/172664358): Creates a new op instead of reusing constant op. -// Creates a constant op to represent stateful variable. The function static -// variable `stateful_variable_idx` is used as a unique value for each constant -// to avoid CSEed. `tensor` is the data structure of flatbuffer. `shaped_type` -// is the ShapedType for the const op. +// Creates a constant op with "tfl.is_variable" attribute to represent stateful +// variable. The function static variable `stateful_variable_idx` is used as a +// unique value for each constant to avoid CSEed. `tensor` is the data structure +// of flatbuffer. `shaped_type` is the ShapedType for the const op. 
StatusOr BuildVariableOp(const tflite::TensorT& tensor, OpBuilder builder, Location loc) { TF_ASSIGN_OR_RETURN(auto type, GetTensorType(tensor, builder, @@ -626,6 +626,7 @@ StatusOr BuildVariableOp(const tflite::TensorT& tensor, return op.getOperation(); } auto op = builder.create(loc, value); + op->setAttr("tfl.is_variable", builder.getUnitAttr()); if (tensor.quantization && !tensor.quantization->min.empty()) { if (auto stats_op = ConvertMinMaxToStatsOp(tensor, builder, op.getResult())) { @@ -1904,11 +1905,6 @@ OwningOpRef tflite::FlatBufferToMlir( mlir::UnitAttr::get(builder.getContext())); } - if (!model->metadata_buffer.empty()) { - module->setAttr("tfl.metadata_buffer", - builder.getI32ArrayAttr(model->metadata_buffer)); - } - if (use_stablehlo_constant) { module->setAttr("tfl.metadata", builder.getDictionaryAttr(builder.getNamedAttr( diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 8d97a1e1f2b349..b51d1b1d7019c5 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -285,6 +285,14 @@ static mlir::Attribute BuildRankedTensorAttr(std::vector shape, return mlir::DenseIntElementsAttr::get(ty, value); } +static mlir::Attribute BuildI64ArrayAttr(std::vector shape, + std::vector value, + mlir::Builder builder) { + // Expand splats. BuildI64ArrayAttr assumes shape.size() == 1. + if (value.size() == 1) value.resize(shape[0], value[0]); + return builder.getDenseI64ArrayAttr(value); +} + static mlir::Attribute BuildF32ArrayAttr(std::vector value, mlir::Builder builder) { std::vector typecast(value.begin(), value.end()); @@ -400,13 +408,11 @@ void BuiltinOptions2ToAttributesManual( std::vector shape = { static_cast(op->start_indices.size())}; attributes.emplace_back(builder.getNamedAttr( - "start_indices", - BuildRankedTensorAttr(shape, op->start_indices, builder))); + "start_indices", BuildI64ArrayAttr(shape, op->start_indices, builder))); attributes.emplace_back(builder.getNamedAttr( - "limit_indices", - BuildRankedTensorAttr(shape, op->limit_indices, builder))); + "limit_indices", BuildI64ArrayAttr(shape, op->limit_indices, builder))); attributes.emplace_back(builder.getNamedAttr( - "strides", BuildRankedTensorAttr(shape, op->strides, builder))); + "strides", BuildI64ArrayAttr(shape, op->strides, builder))); return; } if (const auto* op = op_union.AsStablehloConvolutionOptions()) { @@ -496,20 +502,20 @@ void BuiltinOptions2ToAttributesManual( static_cast(op->edge_padding_low.size())}; attributes.emplace_back(builder.getNamedAttr( "edge_padding_low", - BuildRankedTensorAttr(shape, op->edge_padding_low, builder))); + BuildI64ArrayAttr(shape, op->edge_padding_low, builder))); attributes.emplace_back(builder.getNamedAttr( "edge_padding_high", - BuildRankedTensorAttr(shape, op->edge_padding_high, builder))); + BuildI64ArrayAttr(shape, op->edge_padding_high, builder))); attributes.emplace_back(builder.getNamedAttr( "interior_padding", - BuildRankedTensorAttr(shape, op->interior_padding, builder))); + BuildI64ArrayAttr(shape, op->interior_padding, builder))); return; } if (const auto* op = op_union.AsStablehloDynamicSliceOptions()) { attributes.emplace_back(builder.getNamedAttr( "slice_sizes", - BuildRankedTensorAttr({static_cast(op->slice_sizes.size())}, - op->slice_sizes, builder))); + BuildI64ArrayAttr({static_cast(op->slice_sizes.size())}, + op->slice_sizes, builder))); return; } if (const auto* op = op_union.AsStablehloCompareOptions()) { @@ 
-623,8 +629,8 @@ void BuiltinOptions2ToAttributesManual( if (!op->permutation.empty()) { attributes.emplace_back(builder.getNamedAttr( "permutation", - BuildRankedTensorAttr({static_cast(op->permutation.size())}, - op->permutation, builder))); + BuildI64ArrayAttr({static_cast(op->permutation.size())}, + op->permutation, builder))); } return; diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td b/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td new file mode 100644 index 00000000000000..d9200ddc70f112 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.td @@ -0,0 +1,56 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the optimization pattern definition file for TensorFlow Lite. + +include "mlir/IR/OpBase.td" +include "mlir/IR/PatternBase.td" +include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" +include "tensorflow/compiler/mlir/lite/utils/utils.td" + +// Returns Squeezed shape of a ranked-tensor. +// Squeezed, here, means eliminating any 1s' in the +// dimensions of the tensor +def GetSqueezedShape: NativeCodeCall<"GetSqueezedShape($0)">; + +// This is a utility function to deduct the effective permutation to apply on +// TFL_TransposeOp when the tensor has some dimensions with value==1 +def GetSqueezedPermutation: NativeCodeCall<"GetSqueezedPermutation($0, $1)">; + +// Check to see if the tensor dimensions can be Squeezed by eliminating 1s' +def CanSqueezeTensor : Constraint GetSqueezedShape($0).getNumElements()">>; + + +// Pattern to convert TFL_TransposeOp with rank>6 to rank<=6 if there are +// redundant dimensions in the tensor. For example- [2x1x3] == [2x3] and 1 is +// not contributing to the dimentionality. 
This will run if the rank>6 // Pattern will convert- // %0 = "tfl.transpose"(%arg0, %cst) : (tensor<56x8x56x1x1x1x7xf32>, tensor<7xi32>) -> tensor<1x1x8x56x56x7x1xf32> // to- // %0 = "tfl.reshape"(%arg0, %cst) : (tensor<56x8x56x1x1x1x7xf32>, tensor<4xi32>) -> tensor<56x8x56x7xf32> // %1 = "tfl.transpose"(%0, %cst_0) : (tensor<56x8x56x7xf32>, tensor<4xi32>) -> tensor<8x56x56x7xf32> // %2 = "tfl.reshape"(%1, %cst_1) : (tensor<8x56x56x7xf32>, tensor<7xi32>) -> tensor<1x1x8x56x56x7x1xf32> def ConvertTransposeToDecreaseRank : Pat< (TFL_TransposeOp:$output_transpose $input, (Arith_ConstantOp:$permutation $_)), (TFL_ReshapeOp (TFL_TransposeOp (TFL_ReshapeOp $input, (Arith_ConstantOp (GetSqueezedShape $input))), (Arith_ConstantOp (GetSqueezedPermutation $input, $permutation))), (Arith_ConstantOp (GetShape $output_transpose))), [(AnyStaticShapeTensor $input), (HasRankAtLeast<7> $input), (CanSqueezeTensor $input)]>; diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 939d840f404445..779f8580c7144a 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -27,6 +27,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "absl/strings/escaping.h" #include "Eigen/Core" // from @eigen_archive #include "llvm/ADT/APFloat.h" @@ -146,6 +147,74 @@ Operation* getDefiningBroadcastArgsOp(Value operand) { } return parent_of_defining_op; } + +// Returns the shape of a ranked tensor. +// Precondition: value_tensor is a ranked tensor. +// Returns a squeezed shape. Squeezing here means eliminating the redundant +// dimensions of size 1. +DenseElementsAttr GetSqueezedShape(Value value_tensor) { + auto value_shape_type = value_tensor.getType().dyn_cast(); + assert(value_shape_type.hasRank() && "value_tensor should be ranked tensor"); + + auto value_shape = value_shape_type.getShape(); + SmallVector return_squeeze_shape; + return_squeeze_shape.reserve(value_shape.size()); + + for (size_t dim_idx = 0; dim_idx < value_shape.size(); ++dim_idx) { + int64_t dim = value_shape[dim_idx]; + if (dim == 1) { + continue; + } + return_squeeze_shape.push_back( + ShapedType::isDynamic(dim) ? -1 : static_cast(dim)); + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(return_squeeze_shape.size())}, + mlir::IntegerType::get(value_tensor.getContext(), 32)), + llvm::ArrayRef(return_squeeze_shape)); +} + +// This is a utility function to deduce the effective permutation to apply on +// TFL_TransposeOp when the tensor has some dimensions with value==1 +// Example- "tfl.transpose"(tensor<56x8x56x1x1x1x7xf32>, [4, 5, 1, 2, 0, 6, 3]) +// The permutation [4, 5, 1, 2, 0, 6, 3] before the squeeze becomes [1, 2, 0, 3] +// after the squeeze is performed, to retain the relative ordering of the non-1 dims.
+DenseElementsAttr GetSqueezedPermutation(Value input_value, + Value input_permutation) { + auto input_shape = input_value.getType().dyn_cast().getShape(); + absl::flat_hash_map permutation_map; + + for (size_t before_dim_idx = 0, after_dim_idx = 0; + before_dim_idx < input_shape.size(); ++before_dim_idx) { + if (input_shape[before_dim_idx] == 1) { + continue; + } + permutation_map.insert({before_dim_idx, after_dim_idx++}); + } + + SmallVector squeezed_permutation; + DenseElementsAttr input_perm_const; + if (matchPattern(input_permutation, m_Constant(&input_perm_const))) { + for (int32_t idx = 0; idx < input_perm_const.getNumElements(); ++idx) { + size_t perm = input_perm_const.getValues()[idx].getSExtValue(); + if (input_shape[perm] == 1) { + continue; + } + squeezed_permutation.push_back(permutation_map[perm]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(squeezed_permutation.size())}, + mlir::IntegerType::get(input_permutation.getContext(), 32)), + llvm::ArrayRef(squeezed_permutation)); +} + +#include "tensorflow/compiler/mlir/lite/ir/tfl_canonicalize.inc" + } // namespace // Returns true when the given type lists contain a single element of shaped @@ -3447,6 +3516,11 @@ void ComputePermutation(ArrayRef perms, ArrayRef output_shape, } // namespace +void TransposeOp::getCanonicalizationPatterns(RewritePatternSet& results, + MLIRContext* context) { + results.add(context); +} + OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 73740be2310ef7..380301a9cbee40 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -50,6 +50,7 @@ typedef TFLDialect TensorFlowLiteDialect; class ControlType : public Type::TypeBase { public: using Base::Base; + static constexpr StringLiteral name = "tfl.control"; }; #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.h.inc" diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 4b915afcb0603d..45a1e3b25e1335 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -305,6 +305,14 @@ class TFL_OperandHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", TFL_OperandHasRankAtMostPred>; +// Not all dimentions in the tensor will contribute to the data move in a +// TransposeOp. 
Effective rank is the number of dimentions != 1 +class TFL_TransposeOperandHasEffectiveRankAtMost : + PredOpTrait<"operand " # n # " is at most " # m # "-D", + Or<[TFL_OperandIsUnrankedPred, + CPred<"GetSqueezedShape($_op.getOperand(" # n # + ")).cast().size() <= " # m>]>>; + class TFL_OperandHasRankAtLeast : PredOpTrait<"operand " # n # " is at least " # m # "-D", Or<[TFL_OperandIsUnrankedPred, @@ -1211,12 +1219,12 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I16, I64, I32, UI8, TFL_Str]>:$params, + TFL_TensorOf<[F32, I1, I8, I16, I64, I32, UI8, TFL_Str]>:$params, TFL_TensorOf<[I16, I32, I64]>:$indices ); let results = (outs - TFL_TensorOf<[F32, I8, I16, I64, I32, UI8, TFL_Str]>:$output + TFL_TensorOf<[F32, I1, I8, I16, I64, I32, UI8, TFL_Str]>:$output ); } @@ -3488,7 +3496,7 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [ def TFL_TransposeOp : TFL_Op<"transpose", [ Pure, QuantizableResult, - TFL_OperandHasRankAtMost<0, 6>, + TFL_TransposeOperandHasEffectiveRankAtMost<0, 6>, TFL_OperandHasRank<1, 1>, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, @@ -3512,6 +3520,8 @@ def TFL_TransposeOp : TFL_Op<"transpose", [ let hasFolder = 1; + let hasCanonicalizer = 1; + let builders = [ OpBuilder<(ins "Value":$input, "Value":$perm), [{ BuildTransposeOp(&$_builder, $_state, input, perm); }]> diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 57a5c93556c4cf..62c2733d2b510c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include @@ -24,10 +25,12 @@ limitations under the License. #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -777,6 +780,14 @@ void QuantizationDriver::PreprocessConstantOps() { auto type = cst.getType().dyn_cast(); if (!type || !type.getElementType().isa()) return; + // Skip if the value is NaN or INF. + // Otherwise the illegal scale/zp will be calculated. 
+ auto float_attr = cst.getValueAttr().dyn_cast(); + if (float_attr) { + auto cst_float_falue = float_attr.getValues()[0]; + if (!cst_float_falue.isFinite()) return; + } + Value value = cst.getResult(); builder_.setInsertionPoint(cst); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index 2459f3d214d13a..152b48b1f9043a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -454,6 +454,7 @@ cc_library( deps = [ ":passes_inc_gen", "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/quantization/stablehlo:uniform_quantized_types", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/log:check", "@llvm-project//llvm:Support", @@ -523,7 +524,6 @@ cc_library( "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:custom_call", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:dot_general", - "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:pad", "//tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions:util", "//tensorflow/compiler/mlir/tensorflow", "@llvm-project//mlir:ArithDialect", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir index f5f69b1cf18340..bffb1da2b07117 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir @@ -3,9 +3,9 @@ module { func.func @main(%arg0: tensor<8x128xf32>, %arg1: tensor) -> tensor<11x131xf32> { %0 = "stablehlo.pad"(%arg0, %arg1) { - edge_padding_low = dense<[1, 0]> : tensor<2xi64>, - edge_padding_high = dense<[2, 3]> : tensor<2xi64>, - interior_padding = dense<0> : tensor<2xi64> + edge_padding_low = array, + edge_padding_high = array, + interior_padding = array } : (tensor<8x128xf32>, tensor) -> tensor<11x131xf32> func.return %0 : tensor<11x131xf32> } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-pad.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-pad.mlir index 482a7f9e176977..1d47c5c6382837 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-pad.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-pad.mlir @@ -9,8 +9,8 @@ module { // CHECK: module { // CHECK-NEXT: func @main(%arg0: tensor<8x128xf32>, %arg1: tensor) -> tensor<11x131xf32> { -// CHECK-NEXT: %0 = stablehlo.pad %arg0, %arg1, low = [1, 0], high = [2, 3], interior = [0, 0] : (tensor<8x128xf32>, tensor) -> tensor<11x131xf32> -// CHECK-NEXT: return %0 : tensor<11x131xf32> -// CHECK-NEXT: } +// CHECK-NEXT: %0 = stablehlo.pad %arg0, %arg1, low = [1, 0], high = [2, 3], interior = [0, 0] : (tensor<8x128xf32>, tensor) -> tensor<11x131xf32> +// CHECK-NEXT: return %0 : tensor<11x131xf32> // CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index 593cdbf4fa8b4d..b2948e59fae0b7 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -1786,7 +1786,7 @@ func.func @round_nearest_even(%arg0: tensor<2xf32>) -> tensor<2xf32> { // 
CHECK-SAME: %[[VAL_1:.*]]: tensor<256xf32>) -> tensor<1xf32> { // CHECK: %[[VAL_2:.*]] = arith.constant dense<[256, 1]> : tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<256x1xf32> -// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_3]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_3]]) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<1> : tensor<1xi64> // CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> // CHECK: return %[[VAL_6]] : tensor<1xf32> @@ -1803,7 +1803,7 @@ func.func @convert_dot_2d_1d(%arg0: tensor<1x256xf32>, %arg1: tensor<256xf32>) - // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> // CHECK: %[[VAL_4:.*]] = arith.constant dense<[256, 1]> : tensor<2xi64> // CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_4]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<256x1xf32> -// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_5]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_6:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_5]]) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> // CHECK: %[[VAL_7:.*]] = arith.constant dense<> : tensor<0xi64> // CHECK: %[[VAL_8:.*]] = "tf.Reshape"(%[[VAL_6]], %[[VAL_7]]) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor // CHECK: return %[[VAL_8]] : tensor @@ -1816,7 +1816,7 @@ func.func @convert_dot_1d_1d(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> // CHECK-LABEL: func @convert_dot_2d_2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { -// CHECK: %[[VAL_2:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_2:.*]] = "tf.BatchMatMulV3"(%[[VAL_0]], %[[VAL_1]]) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> // CHECK: return %[[VAL_2]] : tensor<1x1xf32> // CHECK: } func.func @convert_dot_2d_2d(%arg0: tensor<1x256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1x1xf32> { @@ -1895,7 +1895,7 @@ func.func @dynamic_broadcast_in_dim_general_case_expand_middle_dim(%arg0: tensor // CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_6]]) : (tensor<3x5x1x2x6xf32>, tensor<3xi64>) -> tensor<3x5x12xf32> // CHECK: %[[VAL_8:.*]] = arith.constant dense<[3, 12, 4]> : tensor<3xi64> // CHECK: %[[VAL_9:.*]] = "tf.Reshape"(%[[VAL_5]], %[[VAL_8]]) : (tensor<3x2x6x4xf32>, tensor<3xi64>) -> tensor<3x12x4xf32> -// CHECK: %[[VAL_10:.*]] = "tf.BatchMatMulV3"(%[[VAL_7]], %[[VAL_9]]) <{adj_x = false, adj_y = false}> : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> +// CHECK: %[[VAL_10:.*]] = "tf.BatchMatMulV3"(%[[VAL_7]], %[[VAL_9]]) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> // CHECK: %[[VAL_11:.*]] = arith.constant dense<[3, 5, 1, 4]> : tensor<4xi64> // CHECK: %[[VAL_12:.*]] = "tf.Reshape"(%[[VAL_10]], %[[VAL_11]]) : 
(tensor<3x5x4xf32>, tensor<4xi64>) -> tensor<3x5x1x4xf32> // CHECK: return %[[VAL_12]] : tensor<3x5x1x4xf32> @@ -1929,7 +1929,7 @@ func.func @quantized_dot_general_not_converted(%arg0: tensor<1x1x512xf32>, %arg1 // CHECK-SAME: %[[VAL_1:.*]]: tensor<1024x1024xf32>) -> tensor<1x1x1024xf32> { // CHECK: %[[VAL_2:.*]] = arith.constant dense<[1, 1024]> : tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : {{.*}} -> tensor<1x1024xf32> -// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : {{.*}} -> tensor<1x1024xf32> +// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : {{.*}} -> tensor<1x1024xf32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<[1, 1, 1024]> : tensor<3xi64> // CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : {{.*}} -> tensor<1x1x1024xf32> // CHECK: return %[[VAL_6]] : tensor<1x1x1024xf32> @@ -1952,7 +1952,7 @@ func.func @convert_dot_general_repeated(%arg0: tensor<1x1x1024xf32>, %arg1: tens // CHECK-SAME: %[[VAL_1:.*]]: tensor<256x8xi8>) -> tensor<8xi32> { // CHECK: %[[VAL_2:.*]] = arith.constant dense<[1, 256]> : tensor<2xi64> // CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xi8>, tensor<2xi64>) -> tensor<1x256xi8> -// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : (tensor<1x256xi8>, tensor<256x8xi8>) -> tensor<1x8xi32> +// CHECK: %[[VAL_4:.*]] = "tf.BatchMatMulV3"(%[[VAL_3]], %[[VAL_1]]) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<1x256xi8>, tensor<256x8xi8>) -> tensor<1x8xi32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<8> : tensor<1xi64> // CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x8xi32>, tensor<1xi64>) -> tensor<8xi32> // CHECK: return %[[VAL_6]] : tensor<8xi32> @@ -1982,7 +1982,7 @@ func.func @convert_dot_general_int8(%arg0: tensor<256xi8>, %arg1: tensor<256x8xi // CHECK-DAG: %cst_4 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %4 = "tf.Concat"(%cst_4, %cst_3, %3, %2) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %5 = "tf.Reshape"(%0, %4) : (tensor<4x256x?xf32>, tensor<3xi32>) -> tensor<4x256x?xf32> -// CHECK: %6 = "tf.BatchMatMulV3"(%arg0, %5) <{adj_x = false, adj_y = false}> : (tensor<4x4x256xf32>, tensor<4x256x?xf32>) -> tensor<4x4x?xf32> +// CHECK: %6 = "tf.BatchMatMulV3"(%arg0, %5) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<4x4x256xf32>, tensor<4x256x?xf32>) -> tensor<4x4x?xf32> // CHECK: %7 = "tf.Shape"(%arg0) : (tensor<4x4x256xf32>) -> tensor<3xi32> // CHECK: %8 = "tf.Shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> // CHECK-DAG: %cst_5 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> @@ -2032,7 +2032,7 @@ func.return %0 : tensor<4x4x?xf32> // CHECK-DAG: %cst_9 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %11 = "tf.Concat"(%cst_9, %10, %9, %8) : (tensor, tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK: %12 = "tf.Reshape"(%0, %11) : (tensor<2x?x3x4xf32>, tensor<4xi32>) -> tensor<2x?x3x4xf32> -// CHECK: %13 = "tf.BatchMatMulV3"(%6, %12) <{adj_x = false, adj_y = false}> : (tensor<2x?x2x3xf32>, tensor<2x?x3x4xf32>) -> tensor<2x?x2x4xf32> +// CHECK: %13 = "tf.BatchMatMulV3"(%6, %12) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<2x?x2x3xf32>, 
tensor<2x?x3x4xf32>) -> tensor<2x?x2x4xf32> // CHECK: %14 = "tf.Shape"(%arg0) : (tensor<2x?x2x3xf32>) -> tensor<4xi32> // CHECK: %15 = "tf.Shape"(%arg1) : (tensor<2x?x4x3xf32>) -> tensor<4xi32> // CHECK-DAG: %cst_10 = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> @@ -2080,7 +2080,7 @@ func.return %0 : tensor<2x?x2x4xf32> // CHECK-DAG: %cst_9 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %9 = "tf.Concat"(%cst_9, %cst_8, %8, %7) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %10 = "tf.Reshape"(%0, %9) : (tensor<2x3x4x?xf32>, tensor<3xi32>) -> tensor<2x3x?xf32> -// CHECK: %11 = "tf.BatchMatMulV3"(%5, %10) <{adj_x = false, adj_y = false}> : (tensor<2x?x3xf32>, tensor<2x3x?xf32>) -> tensor<2x?x?xf32> +// CHECK: %11 = "tf.BatchMatMulV3"(%5, %10) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<2x?x3xf32>, tensor<2x3x?xf32>) -> tensor<2x?x?xf32> // CHECK: %12 = "tf.Shape"(%arg0) : (tensor<2x2x?x3xf32>) -> tensor<4xi32> // CHECK: %13 = "tf.Shape"(%arg1) : (tensor<2x4x?x3xf32>) -> tensor<4xi32> // CHECK-DAG: %cst_10 = "tf.Const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> @@ -2126,7 +2126,7 @@ func.return %0 : tensor<2x2x?x4x?xf32> // CHECK-DAG: %cst_8 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: %8 = "tf.Concat"(%cst_8, %cst_7, %7, %6) : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK: %9 = "tf.Reshape"(%arg1, %8) : (tensor<4x?x256xf32>, tensor<3xi32>) -> tensor<4x?x256xf32> -// CHECK: %10 = "tf.BatchMatMulV3"(%4, %9) <{adj_x = false, adj_y = false}> : (tensor<4x4x?xf32>, tensor<4x?x256xf32>) -> tensor<4x4x256xf32> +// CHECK: %10 = "tf.BatchMatMulV3"(%4, %9) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> : (tensor<4x4x?xf32>, tensor<4x?x256xf32>) -> tensor<4x4x256xf32> // CHECK: return %10 : tensor<4x4x256xf32> // CHECK: } func.func @convert_dot_general_dynamic_contracting_dim(%arg0: tensor<4x4x?xf32>, %arg1: tensor<4x?x256xf32>) -> tensor<4x4x256xf32> { @@ -3742,6 +3742,26 @@ func.func @convert_gather(%arg0: tensor<147456xf16>, %arg1: tensor<192x256x1xi32 func.return %0 : tensor<192x256xf16> } +// CHECK-LABEL: func @convert_gather_with_ui32indices( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<147456xf16>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<192x256x1xui32>) +// CHECK: %[[INDICES:.*]] = "tf.Cast"(%arg1) <{Truncate = false}> : (tensor<192x256x1xui32>) -> tensor<192x256x1xi64> +// CHECK: %[[VAL_0:.*]] = "tf.GatherNd"(%[[ARG_0]], %[[INDICES]]) : {{.*}} -> tensor<192x256xf16> +// CHECK: return %[[VAL_0]] +// CHECK: } +func.func @convert_gather_with_ui32indices(%arg0: tensor<147456xf16>, %arg1: tensor<192x256x1xui32>) -> tensor<192x256xf16> { + %0 = "mhlo.gather"(%arg0, %arg1) { + dimension_numbers = #mhlo.gather< + collapsed_slice_dims = [0], + index_vector_dim = 2, + start_index_map = [0], + >, + indices_are_sorted = false, + slice_sizes = dense<1> : tensor<1xi64> + } : (tensor<147456xf16>, tensor<192x256x1xui32>) -> tensor<192x256xf16> + func.return %0 : tensor<192x256xf16> +} + // CHECK-LABEL: func @convert_gather_nd( // CHECK-SAME: %[[VAL_0:.*]]: tensor<98x128xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x64xi32>) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir deleted file mode 100644 index b72b4296c000ff..00000000000000 --- 
a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_pad.mlir +++ /dev/null @@ -1,175 +0,0 @@ -// RUN: odml-to-stablehlo-opt %s -tfl-legalize-hlo -split-input-file | FileCheck %s --dump-input=fail - -func.func @mhlo_pad_test__noop(%input: tensor<5x7xf32>, %padding_value: tensor) -> tensor<5x7xf32> { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[0, 0]> : tensor<2xi64>, - edge_padding_high = dense<[0, 0]> : tensor<2xi64>, - interior_padding = dense<[0, 0]> : tensor<2xi64> - } : (tensor<5x7xf32>, tensor) -> tensor<5x7xf32> - func.return %0: tensor<5x7xf32> - -// CHECK-LABEL: mhlo_pad_test__noop -// CHECK: return %arg0 : tensor<5x7xf32> -} - -func.func @mhlo_pad_test__pad_all(%input: tensor<5x7xf32>, %padding_value: tensor) -> tensor<9x10xf32> { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[3, 2]> : tensor<2xi64>, - edge_padding_high = dense<[1, 1]> : tensor<2xi64>, - interior_padding = dense<[0, 0]> : tensor<2xi64> - } : (tensor<5x7xf32>, tensor) -> tensor<9x10xf32> - func.return %0: tensor<9x10xf32> - -// CHECK-LABEL: mhlo_pad_test__pad_all -// CHECK: %cst = arith.constant dense<{{\[}}[3, 1], [2, 1]]> : tensor<2x2xi64> -// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2x2xi64>, tensor) -> tensor<9x10xf32> -// CHECK: return %0 : tensor<9x10xf32> -} - -func.func @mhlo_pad_test__crop_all(%input: tensor<5x7xf32>, %padding_value: tensor) -> tensor<3x5xf32> { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[-1, -1]> : tensor<2xi64>, - edge_padding_high = dense<[-1, -1]> : tensor<2xi64>, - interior_padding = dense<[0, 0]> : tensor<2xi64> - } : (tensor<5x7xf32>, tensor) -> tensor<3x5xf32> - func.return %0: tensor<3x5xf32> - -// CHECK-LABEL: mhlo_pad_test__crop_all -// CHECK: %cst = arith.constant dense<1> : tensor<2xi64> -// CHECK: %cst_0 = arith.constant dense<-1> : tensor<2xi64> -// CHECK: %cst_1 = arith.constant dense<1> : tensor<2xi64> -// CHECK: %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<5x7xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<3x5xf32> -// CHECK: return %0 : tensor<3x5xf32> -} - -func.func @mhlo_pad_test__interior_pad_all(%input: tensor<5x7xf32>, %padding_value: tensor) -> tensor<9x13xf32> { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[0, 0]> : tensor<2xi64>, - edge_padding_high = dense<[0, 0]> : tensor<2xi64>, - interior_padding = dense<[1, 1]> : tensor<2xi64> - } : (tensor<5x7xf32>, tensor) -> tensor<9x13xf32> - func.return %0: tensor<9x13xf32> - -// CHECK-LABEL: mhlo_pad_test__interior_pad_all -// CHECK: %cst = arith.constant dense<2> : tensor<2xi32> -// CHECK: %0 = "tfl.dilate"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2xi32>, tensor) -> tensor<9x13xf32> -// CHECK: return %0 : tensor<9x13xf32> -} - -func.func @mhlo_pad_test__pad_and_crop(%input: tensor<5x7xf32>, %padding_value: tensor) -> tensor<5x7xf32> { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[-1, 1]> : tensor<2xi64>, - edge_padding_high = dense<[1, -1]> : tensor<2xi64>, - interior_padding = dense<[0, 0]> : tensor<2xi64> - } : (tensor<5x7xf32>, tensor) -> tensor<5x7xf32> - func.return %0: tensor<5x7xf32> - -// CHECK-LABEL: mhlo_pad_test__pad_and_crop -// CHECK: %cst = arith.constant dense<{{\[}}[0, 1], [1, 0]]> : tensor<2x2xi64> -// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : 
(tensor<5x7xf32>, tensor<2x2xi64>, tensor) -> tensor<6x8xf32> -// CHECK: %cst_0 = arith.constant dense<[1, 0]> : tensor<2xi64> -// CHECK: %cst_1 = arith.constant dense<[0, -1]> : tensor<2xi64> -// CHECK: %cst_2 = arith.constant dense<1> : tensor<2xi64> -// CHECK: %1 = "tfl.strided_slice"(%0, %cst_0, %cst_1, %cst_2) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<6x8xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<5x7xf32> -// CHECK: return %1 : tensor<5x7xf32> -} - -func.func @mhlo_pad_test__pad_and_crop_and_interior_pad(%input: tensor<5x7xf32>, %padding_value: tensor) -> tensor<13x25xf32> { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[-1, 1]> : tensor<2xi64>, - edge_padding_high = dense<[1, -1]> : tensor<2xi64>, - interior_padding = dense<[2, 3]> : tensor<2xi64> - } : (tensor<5x7xf32>, tensor) -> tensor<13x25xf32> - func.return %0: tensor<13x25xf32> - -// CHECK-LABEL: mhlo_pad_test__pad_and_crop_and_interior_pad -// CHECK: %cst = arith.constant dense<[3, 4]> : tensor<2xi32> -// CHECK: %0 = "tfl.dilate"(%arg0, %cst, %arg1) : (tensor<5x7xf32>, tensor<2xi32>, tensor) -> tensor<13x25xf32> -// CHECK: %cst_0 = arith.constant dense<{{\[}}[0, 1], [1, 0]]> : tensor<2x2xi64> -// CHECK: %1 = "tfl.padv2"(%0, %cst_0, %arg1) : (tensor<13x25xf32>, tensor<2x2xi64>, tensor) -> tensor<14x26xf32> -// CHECK: %cst_1 = arith.constant dense<[1, 0]> : tensor<2xi64> -// CHECK: %cst_2 = arith.constant dense<[0, -1]> : tensor<2xi64> -// CHECK: %cst_3 = arith.constant dense<1> : tensor<2xi64> -// CHECK: %2 = "tfl.strided_slice"(%1, %cst_1, %cst_2, %cst_3) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<14x26xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<13x25xf32> -// CHECK: return %2 : tensor<13x25xf32> -} - -func.func @mhlo_pad_test__pad_all_unknown_shape(%input: tensor, %padding_value: tensor) -> tensor { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[1, 1, 1, 1]> : tensor<4xi64>, - edge_padding_high = dense<[1, 1, 1, 1]> : tensor<4xi64>, - interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64> - } : (tensor, tensor) -> tensor - func.return %0: tensor - -// CHECK-LABEL: mhlo_pad_test__pad_all_unknown_shape -// CHECK: %cst = arith.constant dense<1> : tensor<4x2xi64> -// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor, tensor<4x2xi64>, tensor) -> tensor -// CHECK: return %0 : tensor -} - -func.func @mhlo_pad_test__crop_all_unknown_shape(%input: tensor, %padding_value: tensor) -> tensor { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[-1, -1, -1, -1]> : tensor<4xi64>, - edge_padding_high = dense<[-1, -1, -1, -1]> : tensor<4xi64>, - interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64> - } : (tensor, tensor) -> tensor - func.return %0: tensor - -// CHECK-LABEL: mhlo_pad_test__crop_all_unknown_shape -// CHECK: %cst = arith.constant dense<1> : tensor<4xi64> -// CHECK: %cst_0 = arith.constant dense<-1> : tensor<4xi64> -// CHECK: %cst_1 = arith.constant dense<1> : tensor<4xi64> -// CHECK: %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor -// CHECK: return %0 : tensor -} - -func.func 
@mhlo_pad_test__pad_all_unknown_dim0(%input: tensor, %padding_value: tensor) -> tensor { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[1, 1, 1, 1]> : tensor<4xi64>, - edge_padding_high = dense<[1, 1, 1, 1]> : tensor<4xi64>, - interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64> - } : (tensor, tensor) -> tensor - func.return %0: tensor - -// CHECK-LABEL: mhlo_pad_test__pad_all_unknown_dim0 -// CHECK: %cst = arith.constant dense<1> : tensor<4x2xi64> -// CHECK: %0 = "tfl.padv2"(%arg0, %cst, %arg1) : (tensor, tensor<4x2xi64>, tensor) -> tensor -// CHECK: return %0 : tensor -} - -func.func @mhlo_pad_test__crop_all_unknown_dim0(%input: tensor, %padding_value: tensor) -> tensor { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[-1, -1, -1, -1]> : tensor<4xi64>, - edge_padding_high = dense<[-1, -1, -1, -1]> : tensor<4xi64>, - interior_padding = dense<[0, 0, 0, 0]> : tensor<4xi64> - } : (tensor, tensor) -> tensor - func.return %0: tensor - -// CHECK-LABEL: mhlo_pad_test__crop_all_unknown_dim0 -// CHECK: %cst = arith.constant dense<1> : tensor<4xi64> -// CHECK: %cst_0 = arith.constant dense<-1> : tensor<4xi64> -// CHECK: %cst_1 = arith.constant dense<1> : tensor<4xi64> -// CHECK: %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor -// CHECK: return %0 : tensor -} - -func.func @mhlo_pad_test__pad_and_crop_and_interior_pad_unknown_dim0(%input: tensor, %padding_value: tensor) -> tensor { - %0 = "mhlo.pad"(%input, %padding_value) { - edge_padding_low = dense<[-2, -1, 0, 1]> : tensor<4xi64>, - edge_padding_high = dense<[1, 0, -1, -2]> : tensor<4xi64>, - interior_padding = dense<[1, 2, 3, 4]> : tensor<4xi64> - } : (tensor, tensor) -> tensor - func.return %0: tensor - -// CHECK-LABEL: mhlo_pad_test__pad_and_crop_and_interior_pad_unknown_dim0 -// CHECK: %cst = arith.constant dense<[2, 3, 4, 5]> : tensor<4xi32> -// CHECK: %0 = "tfl.dilate"(%arg0, %cst, %arg1) : (tensor, tensor<4xi32>, tensor) -> tensor -// CHECK: %cst_0 = arith.constant dense<{{\[}}[0, 1], [0, 0], [0, 0], [1, 0]]> : tensor<4x2xi64> -// CHECK: %1 = "tfl.padv2"(%0, %cst_0, %arg1) : (tensor, tensor<4x2xi64>, tensor) -> tensor -// CHECK: %cst_1 = arith.constant dense<[2, 1, 0, 0]> : tensor<4xi64> -// CHECK: %cst_2 = arith.constant dense<[0, 0, -1, -2]> : tensor<4xi64> -// CHECK: %cst_3 = arith.constant dense<1> : tensor<4xi64> -// CHECK: %2 = "tfl.strided_slice"(%1, %cst_1, %cst_2, %cst_3) {begin_mask = 12 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor, tensor<4xi64>, tensor<4xi64>, tensor<4xi64>) -> tensor -// CHECK: return %2 : tensor -} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir index 073f31e39786d9..ef637a848461d9 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/unfuse_mhlo_batch_norm.mlir @@ -133,7 +133,7 @@ func.func @batchNormTraining_4D_middle_features( %x: tensor<3x4x256x6xf32>, %scale: tensor<256xf32>, %offset: tensor<256xf32>) -> (tensor<3x4x256x6xf32>) { // CHECK-DAG: %[[CST_AXIS:.+]] = "tf.Const"() <{value = dense<[0, 1, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> - // CHECK-DAG: 
%[[X_SHAPE:.+]] = shape.const_shape [3, 4, 256, 6] : tensor<4xindex> + // CHECK-DAG: %[[X_SHAPE:.+]] = shape.shape_of %[[X]] : tensor<3x4x256x6xf32> -> tensor<4xindex> // CHECK-DAG: %[[EPS:.+]] = mhlo.constant dense<1.000000e+00> : tensor<256xf32> // CHECK-DAG: %[[MEAN:.+]] = "tf.Mean"(%arg0, %[[CST_AXIS]]) <{keep_dims = false}> : (tensor<3x4x256x6xf32>, tensor<3xi32>) -> tensor<256xf32> // CHECK-DAG: %[[MEAN_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[X_SHAPE]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<256xf32>, tensor<4xindex>) -> tensor<3x4x256x6xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir index 15b3e37326cfe0..7272b5f17301cb 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir @@ -24,7 +24,7 @@ func.func @uniform_quantize_op_quantized_input(%arg: tensor<2x2x!quant.uniform) -> tensor<2x // ----- -// Tests that the pattern doesn't match when the output tensor's sotrage type +// Tests that the pattern doesn't match when the output tensor's storage type // is i32. i32 storage type for quantized type is not compatible with // `tfl.quantize`. @@ -104,8 +104,8 @@ func.func @uniform_dequantize_op_return_f64(%arg: tensor<2x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { +// CHECK-LABEL: convolution_upstream_full_integer +func.func @convolution_upstream_full_integer(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> return %1 : tensor<1x3x3x2x!quant.uniform> @@ -123,8 +123,8 @@ func.func @convolution_op(%arg0: tensor<1x3x3x4x!quant.uniform>, %arg1: tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { +// CHECK-LABEL: convolution_upstream_full_integer_non_const_filter +func.func @convolution_upstream_full_integer_non_const_filter(%arg0: tensor<1x3x3x4x!quant.uniform>, %arg1: tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> return %0 : tensor<1x3x3x2x!quant.uniform> } @@ -139,8 +139,8 @@ func.func @convolution_op_non_const_filter(%arg0: tensor<1x3x3x4x!quant.uniform< // Test that if the window padding contains values of 0, tfl.pad op is not // created and the `padding` attribute is set as "VALID". 
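The test comment above spells out the padding rule for the quantized convolution legalization: an all-zero window padding legalizes directly with `padding = "VALID"`, while a non-zero entry is materialized as a separate tfl.pad in front of the convolution. A minimal sketch of that decision, using a hypothetical helper name rather than the pass's actual predicate:

```cpp
#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"

// Hypothetical helper: `pads` holds [low, high] pairs per spatial dimension,
// e.g. {1, 1, 1, 1} for window pad = [[1, 1], [1, 1]].
inline bool NeedsExplicitPadOp(llvm::ArrayRef<int64_t> pads) {
  // All-zero (or absent) window padding maps straight to padding = "VALID";
  // any non-zero entry needs a tfl.pad inserted before the convolution.
  return llvm::any_of(pads, [](int64_t p) { return p != 0; });
}
```

The renamed test for the zero-padding case follows below.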
-// CHECK-LABEL: convolution_op_valid_padding -func.func @convolution_op_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { +// CHECK-LABEL: convolution_upstream_full_integer_valid_padding +func.func @convolution_upstream_full_integer_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 0], [0, 0]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> return %1 : tensor<1x1x1x2x!quant.uniform> @@ -157,8 +157,8 @@ func.func @convolution_op_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { +// CHECK-LABEL: convolution_upstream_full_integer_valid_padding +func.func @convolution_upstream_full_integer_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> // The `window` attribute is empty. %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> @@ -175,8 +175,8 @@ func.func @convolution_op_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> { +// CHECK-LABEL: convolution_upstream_full_integer_strides +func.func @convolution_upstream_full_integer_strides(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> // The stride value is explicitly set to [1, 2]. %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 2], pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> @@ -195,8 +195,8 @@ func.func @convolution_strides(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_asym_input +func.func @dot_general_upstream_full_integer_asym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -216,8 +216,8 @@ func.func @dot_general_full_integer_asym_input(%arg0: tensor<1x2x3x4x!quant.unif // Test full integer quantized dot_general with symmetric quantized input. 
-// CHECK-LABEL: dot_general_full_integer_sym_input -func.func @dot_general_full_integer_sym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_sym_input +func.func @dot_general_upstream_full_integer_sym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -237,10 +237,34 @@ func.func @dot_general_full_integer_sym_input(%arg0: tensor<1x2x3x4x!quant.unifo // ----- +// Tests that the pattern does not match when the output tensor's storage +// type is i32. Currently we support qi8, qi8 -> qi8 only for GEMM ops that +// are quantized upstream. Other cases should be handled by regular quantized +// stablehlo.dot_general case. + +// CHECK-LABEL: dot_general_upstream_full_integer_i32_output +func.func @dot_general_upstream_full_integer_i32_output(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { + %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> + %1 = "stablehlo.dot_general"(%arg0, %0) { + dot_dimension_numbers = #stablehlo.dot< + lhs_batching_dimensions = [0, 1], + rhs_batching_dimensions = [0, 1], + lhs_contracting_dimensions = [3], + rhs_contracting_dimensions = [2] + >, + precision_config = [#stablehlo, #stablehlo] + } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> + return %1 : tensor<1x2x3x5x!quant.uniform> +} +// CHECK: stablehlo.dot_general +// CHECK-NOT: tfl.quantize + +// ----- + // Test full integer quantized dot_general with activation as RHS -// CHECK-LABEL: dot_general_full_integer_activation_rhs -func.func @dot_general_full_integer_activation_rhs(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_activation_rhs +func.func @dot_general_upstream_full_integer_activation_rhs(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = "stablehlo.dot_general"(%arg0, %arg1) { dot_dimension_numbers = #stablehlo.dot< lhs_batching_dimensions = [0, 1], @@ -258,8 +282,8 @@ func.func @dot_general_full_integer_activation_rhs(%arg0: tensor<1x2x3x4x!quant. 
// Test full integer quantized dot_general with adj_x -// CHECK-LABEL: dot_general_full_integer_adj_x -func.func @dot_general_full_integer_adj_x(%arg0: tensor<1x2x4x3x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_adj_x +func.func @dot_general_upstream_full_integer_adj_x(%arg0: tensor<1x2x4x3x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -282,8 +306,8 @@ func.func @dot_general_full_integer_adj_x(%arg0: tensor<1x2x4x3x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_adj_y +func.func @dot_general_upstream_full_integer_adj_y(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x5x4xi8>} : () -> tensor<1x2x5x4x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -306,8 +330,8 @@ func.func @dot_general_full_integer_adj_y(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x1x1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_too_many_batches +func.func @dot_general_upstream_full_integer_too_many_batches(%arg0: tensor<1x1x1x2x3x4x!quant.uniform>) -> tensor<1x1x1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x1x1x2x4x5xi8>} : () -> tensor<1x1x1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -328,8 +352,8 @@ func.func @dot_general_full_integer_too_many_batches(%arg0: tensor<1x1x1x2x3x4x! // Test full integer quantized dot_general with too many contracting dimension -// CHECK-LABEL: dot_general_full_integer_too_many_contractions -func.func @dot_general_full_integer_too_many_contractions(%arg0: tensor<1x2x3x4x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_too_many_contractions +func.func @dot_general_upstream_full_integer_too_many_contractions(%arg0: tensor<1x2x3x4x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x4x5xi8>} : () -> tensor<1x2x4x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -350,8 +374,8 @@ func.func @dot_general_full_integer_too_many_contractions(%arg0: tensor<1x2x3x4x // Test full integer quantized dot_general with unsupported contracting dim -// CHECK-LABEL: dot_general_full_integer_wrong_contracting -func.func @dot_general_full_integer_wrong_contracting(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x4x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_wrong_contracting +func.func @dot_general_upstream_full_integer_wrong_contracting(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x4x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -373,8 +397,8 @@ func.func @dot_general_full_integer_wrong_contracting(%arg0: tensor<1x2x3x4x!qua // Test full integer quantized dot_general with float operands -// CHECK-LABEL: dot_general_full_integer_float_operands -func.func @dot_general_full_integer_float_operands(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x4x5xf32>) -> tensor<1x2x3x5xf32> { 
+// CHECK-LABEL: dot_general_upstream_full_integer_float_operands +func.func @dot_general_upstream_full_integer_float_operands(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x4x5xf32>) -> tensor<1x2x3x5xf32> { %0 = "stablehlo.dot_general"(%arg0, %arg1) { dot_dimension_numbers = #stablehlo.dot< lhs_batching_dimensions = [0, 1], @@ -394,8 +418,8 @@ func.func @dot_general_full_integer_float_operands(%arg0: tensor<1x2x3x4xf32>, % // Test full integer quantized dot_general with asymmetric weight (rhs). -// CHECK-LABEL: dot_general_full_integer_asym_weight -func.func @dot_general_full_integer_asym_weight(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_asym_weight +func.func @dot_general_upstream_full_integer_asym_weight(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #stablehlo.dot, precision_config = [#stablehlo, #stablehlo]} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> @@ -409,8 +433,8 @@ func.func @dot_general_full_integer_asym_weight(%arg0: tensor<1x2x3x4x!quant.uni // Test that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized, it is converted to `tfl.fully_connected` op. -// CHECK-LABEL: dot_general_per_axis_quantized_filter -func.func @dot_general_per_axis_quantized_filter(%arg0: tensor<1x3x!quant.uniform>) -> tensor<1x2x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter +func.func @dot_general_upstream_full_integer_per_axis_quantized_filter(%arg0: tensor<1x3x!quant.uniform>) -> tensor<1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> return %1 : tensor<1x2x!quant.uniform> @@ -428,8 +452,8 @@ func.func @dot_general_per_axis_quantized_filter(%arg0: tensor<1x3x!quant.unifor // Test that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized but has a batch dimension, it is not converted. -// CHECK-LABEL: dot_general_per_axis_quantized_filter_with_batch_dim -func.func @dot_general_per_axis_quantized_filter_with_batch_dim(%arg0: tensor<1x1x3x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter_with_batch_dim +func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_with_batch_dim(%arg0: tensor<1x1x3x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x1x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> return %1 : tensor<1x1x2x!quant.uniform> @@ -444,8 +468,8 @@ func.func @dot_general_per_axis_quantized_filter_with_batch_dim(%arg0: tensor<1x // Test that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized but has a batch dim > 1, it is not converted. 
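Taken together, the per-axis-quantized `stablehlo.dot_general` tests above and below outline when the op becomes `tfl.fully_connected`: only a batch-free filter with a single contracting dimension converts, while a batch dimension (including batch > 1, tested next) or multiple contracting dimensions leave the `stablehlo.dot_general` untouched. A rough summary of that criterion as a sketch; the struct and function names are illustrative, not the pass's API:

```cpp
#include <cstdint>

// Illustrative summary only; names are hypothetical.
struct DotGeneralShapeInfo {
  int64_t num_batching_dims = 0;     // batching dims on the quantized filter
  int64_t num_contracting_dims = 1;  // contracting dims on the filter
  bool filter_is_per_axis_quantized = false;
};

inline bool ConvertsToFullyConnected(const DotGeneralShapeInfo& info) {
  // Mirrors the test expectations: per-axis quantized filter, no batching
  // dims, exactly one contracting dim -> tfl.fully_connected; otherwise the
  // stablehlo.dot_general is left as-is.
  return info.filter_is_per_axis_quantized && info.num_batching_dims == 0 &&
         info.num_contracting_dims == 1;
}
```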
-// CHECK-LABEL: dot_general_per_axis_quantized_filter_multibatch -func.func @dot_general_per_axis_quantized_filter_multibatch(%arg0: tensor<3x1x3x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter_multibatch +func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_multibatch(%arg0: tensor<3x1x3x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<3x3x2xi8>} : () -> tensor<3x3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x1x3x!quant.uniform>, tensor<3x3x2x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> return %1 : tensor<3x1x2x!quant.uniform> @@ -460,8 +484,8 @@ func.func @dot_general_per_axis_quantized_filter_multibatch(%arg0: tensor<3x1x3x // Test that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized but has more than one contracting dimension, it is not converted. -// CHECK-LABEL: dot_general_per_axis_quantized_filter_with_multiple_contracting_dims -func.func @dot_general_per_axis_quantized_filter_with_multiple_contracting_dims(%arg0: tensor<1x2x3x!quant.uniform>) -> tensor<1x1x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter_with_multiple_contracting_dims +func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_with_multiple_contracting_dims(%arg0: tensor<1x2x3x!quant.uniform>) -> tensor<1x1x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1, 2] x [2, 1] : (tensor<1x2x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x!quant.uniform> return %1 : tensor<1x1x!quant.uniform> @@ -470,3 +494,25 @@ func.func @dot_general_per_axis_quantized_filter_with_multiple_contracting_dims( // CHECK: stablehlo.dot_general // CHECK-NOT: tfl.fully_connected // CHECK-NOT: tfl.batch_matmul + +// ----- + +// Test that a simple per-tensor quantized stablehlo.dot_general is properly +// fused with a subsequent requantize (qi32->qi8) op then legalized. 
+// Supports the following format: (lhs: qi8, rhs: qi8) -> result: qi32 + +// CHECK-LABEL: dot_general_full_integer +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x1024x!quant.uniform + func.func @dot_general_full_integer(%arg0: tensor<1x1024x!quant.uniform> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) { + %0 = stablehlo.constant() {value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>>) -> tensor<1x3x!quant.uniform> + %2 = stablehlo.uniform_quantize %1 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + +// CHECK-NOT: stablehlo.dot_general +// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<3x1024x!quant.uniform>, value = dense<1> : tensor<3x1024xi8>} : () -> tensor<3x1024x!quant.uniform> +// CHECK: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform:f32, 2.000000e+00>>, value = dense<0> : tensor<3xi32>} : () -> tensor<3x!quant.uniform:f32, 2.000000e+00>> +// CHECK: "tfl.fully_connected"(%[[ARG_1]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform>, tensor<3x!quant.uniform:f32, 2.000000e+00>>) -> tensor<1x3x!quant.uniform> +// CHECK-NOT: tfl.batch_matmul diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc index a5286025463a52..587c971cdffaef 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc @@ -731,17 +731,18 @@ class ComposeUniformQuantizedConvolutionOp auto combined_scale_constant_op = cast( scale_combined_broadcast_in_dim_op.getOperand().getDefiningOp()); - SmallVector filter_scale_values; + SmallVector filter_scale_values; for (const auto combined_scale_value : combined_scale_constant_op.getValue() .cast() .getValues()) { - const float filter_scale_value = - combined_scale_value * input_inverse_scales_value; + // UniformQuantizedPerAxisType requires scales to have double dtype. + const double filter_scale_value = static_cast( + combined_scale_value * input_inverse_scales_value); filter_scale_values.emplace_back(filter_scale_value); } // Assumes it is symmetric. - SmallVector filter_zero_point_values( + SmallVector filter_zero_point_values( /*Size=*/filter_scale_values.size(), /*Value=*/0); // Use quantization dimension = 3 that corresponds to the output channel @@ -1083,15 +1084,17 @@ class ComposeUniformQuantizedDotGeneralOp // s1 * s2 auto merged_scale_constant_op = cast(multiply_op_second_operand.getDefiningOp()); - SmallVector filter_scale_values; + SmallVector filter_scale_values; for (const auto merged_scale : merged_scale_constant_op.getValue() .cast() .getValues()) { // (s1 * s2) * (1 / s1) = s2 - filter_scale_values.push_back(merged_scale * input_inverse_scale_value); + // UniformQuantizedPerAxisType requires scales to have double dtype. 
+ filter_scale_values.push_back( + static_cast(merged_scale * input_inverse_scale_value)); } - SmallVector filter_zero_point_values( + SmallVector filter_zero_point_values( /*Size=*/filter_scale_values.size(), /*Value=*/0); const int quantization_dimension = GetFilterQuantizationDimension( diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index f161bb3c90c3ae..d6ca92d5ca89db 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -3096,8 +3096,20 @@ class ConvertGatherOp : public OpConversionPattern { auto tf_gather_nd_result_type = RankedTensorType::get(transpose_params.canonicalized_output_shape, result_type.getElementType()); + + TF::CastOp cast_op = nullptr; + if (start_indices_type.getElementType().isUnsignedInteger(32)) { + cast_op = rewriter.create( + gather_op->getLoc(), + RankedTensorType::get(start_indices_type.getShape(), + rewriter.getI64Type()), + start_indices); + } + auto tf_gather_nd_op = rewriter.create( - gather_op->getLoc(), tf_gather_nd_result_type, operand, start_indices); + gather_op->getLoc(), tf_gather_nd_result_type, operand, + cast_op ? cast_op.getResult() : start_indices); + if (!need_transpose_after) { rewriter.replaceOp(gather_op, tf_gather_nd_op->getOpResults()); return success(); @@ -3386,9 +3398,6 @@ class ConvertIfOp : public OpConversionPattern { }; // Converts mhlo.pad to tf.PadV2 -// TODO: b/301438955 - This is redundant with the MHLO -> TFLite -// legalization and covers less usecases. We need to check with DarwiNN that -// this can be removed without breaking their workflow. Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) { auto pad_op = cast(old_op); mlir::Location loc = pad_op.getLoc(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD index fb2b2d6f068350..4aaf08a8686e5b 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/BUILD @@ -49,24 +49,6 @@ cc_library( ], ) -cc_library( - name = "pad", - srcs = [ - "pad.cc", - ], - hdrs = [ - "pad.h", - ], - deps = [ - ":util", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - "@local_xla//xla/mlir_hlo", - ], -) - cc_library( name = "dot_general", srcs = [ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc deleted file mode 100644 index 9fd1fcb8402c51..00000000000000 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h" - -#include - -#include "llvm/ADT/SmallVector.h" -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h" -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" - -namespace mlir { -namespace odml { - -ConversionState BuildConversionState(mhlo::PadOp mhlo_pad, - ConversionPatternRewriter& rewriter) { - ConversionState state{ - /*.shlo_op=*/mhlo_pad.getOperation(), - /*.rewriter=*/rewriter, - /*.last_tf_op=*/nullptr, - }; - return state; -} - -// Converts the given StableHLO Pad operation to a chain of TFLite operations. -// -// StableHLO Pad allows dilating, padding and cropping its input, in that order. -// This can be implemented in TFLite as a sequence of these operations. Note -// that all operations do not always need to be called: if there is no dilation -// (resp. pad, crop) we do not need to add it to the chain. -// -// TFLite does not provide a crop operation, the StridedSlice one is used -// instead. -LogicalResult ConvertPadOp::matchAndRewrite( - mhlo::PadOp mhlo_pad, OpAdaptor adaptor, - ConversionPatternRewriter& rewriter) const { - // We don't need to match the pad op as we always know how to convert it. - ConversionState state = BuildConversionState(mhlo_pad, rewriter); - - // Dilate when interior padding is specified different from 0. - AddDilateOpIfRequired(state, mhlo_pad.getInteriorPadding(), - mhlo_pad.getPaddingValue(), - /*is_padding=*/true); - // Pad when padding has positive values. - AddPadOpIfRequired(state, mhlo_pad.getEdgePaddingLow(), - mhlo_pad.getEdgePaddingHigh(), mhlo_pad.getPaddingValue()); - // Crop when padding has negative values. - // - // Note that there is no crop operation in TFLite so we use the StridedSlice - // operation instead. - const DenseElementsAttr strides_data = CreateDenseElementsAttr( - state.rewriter, - llvm::SmallVector(state.GetOperandShape().size(), 1)); - AddStridedSliceOpIfRequired(state, mhlo_pad.getEdgePaddingLow(), - mhlo_pad.getEdgePaddingHigh(), strides_data); - - if (state.last_tf_op) { - rewriter.replaceOp(mhlo_pad, state.last_tf_op); - } else { - rewriter.replaceOp(mhlo_pad, mhlo_pad.getOperand()); - } - return success(); -} - -} // namespace odml -} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h deleted file mode 100644 index c0fa5017b69236..00000000000000 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_ - -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" - -namespace mlir { -namespace odml { - -class ConvertPadOp : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mhlo::PadOp mhlo_pad, OpAdaptor adaptor, - ConversionPatternRewriter& rewriter) const final; -}; - -} // namespace odml -} // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_LEGALIZE_HLO_CONVERSIONS_PAD_H_ diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc index 4432cec521b99d..c2f533776d0408 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc @@ -164,210 +164,6 @@ LogicalResult MatchBinaryReduceFunction(mlir::Region& function) { return success(); } -Value ConversionState::GetOperand() const { - if (last_tf_op) { - return last_tf_op->getResult(0); - } - return hlo_op->getOperand(0); -} - -TensorType ConversionState::GetOperandTensorType() const { - if (last_tf_op) { - return last_tf_op->getResult(0).getType().cast(); - } - return hlo_op->getOperand(0).getType().cast(); -} - -llvm::ArrayRef ConversionState::GetOperandShape() const { - return GetOperandTensorType().getShape(); -} - -namespace { - -// Gets the dilation data for TFLite Dilate. -// -// Depending on the definition of the op we are trying to legalize, a dilation -// can be either seen as interior padding or as a scaling factor where: -// -// scaling_factor = interior_padding + 1 -// -// The is_padding parameter is used to take this difference into account. 
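The removed comment above notes that a dilation can be read either as interior padding or as a scaling factor equal to interior_padding + 1. As a concrete check of the removed dilate -> pad -> crop chain, the shapes from the deleted `mhlo_pad_test__pad_and_crop_and_interior_pad` test (5x7 input, interior padding [2, 3], edge padding low [-1, 1] and high [1, -1], 13x25 result) can be recomputed with a short sketch; the helper name is illustrative only:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative helper, not part of the pass: static result shape of an
// mhlo.pad lowered as tfl.dilate -> tfl.padv2 -> tfl.strided_slice.
std::vector<int64_t> PadResultShape(const std::vector<int64_t>& in,
                                    const std::vector<int64_t>& low,
                                    const std::vector<int64_t>& high,
                                    const std::vector<int64_t>& interior) {
  std::vector<int64_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    // tfl.dilate: scaling factor = interior_padding + 1.
    const int64_t dilated = in[i] + (in[i] - 1) * interior[i];
    // tfl.padv2: only positive edge padding pads.
    const int64_t padded = dilated + std::max<int64_t>(low[i], 0) +
                           std::max<int64_t>(high[i], 0);
    // tfl.strided_slice: negative edge padding crops.
    out[i] =
        padded + std::min<int64_t>(low[i], 0) + std::min<int64_t>(high[i], 0);
  }
  return out;
}
// PadResultShape({5, 7}, {-1, 1}, {1, -1}, {2, 3}) yields {13, 25}, matching
// the deleted test's intermediate 13x25 (dilate), 14x26 (pad), and final
// 13x25 (crop) shapes.
```

The deleted helpers that implemented this behavior follow.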
-llvm::SmallVector GetDilateData(const DenseElementsAttr& dilation, - const bool is_padding) { - llvm::SmallVector data; - for (const auto& v : dilation.getValues()) { - data.push_back(v.getSExtValue() + static_cast(is_padding)); - } - return data; -} - -} // namespace - -void AddDilateOpIfRequired(ConversionState& state, - const DenseElementsAttr& dilation, - const Value padding_value, const bool is_padding) { - const auto dilate_data = GetDilateData(dilation, is_padding); - if (absl::c_any_of(dilate_data, IsNot(1))) { - const TensorType output_type = state.ComputeResultTensorType( - [](int i, const auto& shape, const auto& dilate_data) { - if (shape[i] < 0) { - return shape[i]; - } - return shape[i] + (shape[i] - 1) * (dilate_data[i] - 1); - }, - dilate_data); - - auto dilate_tensor = AddConstantTensor(state, dilate_data); - auto tfl_dilate = state.rewriter.create( - state.hlo_op->getLoc(), output_type, state.GetOperand(), dilate_tensor, - padding_value); - - state.last_tf_op = tfl_dilate; - } -} - -namespace { - -// Gets the pad data for TFLite PadV2. -// -// StableHLO Pad allows negative values for cropping. This functions replaces -// negative values with 0. -llvm::SmallVector GetPadData( - const DenseElementsAttr& edge_padding_low, - const DenseElementsAttr& edge_padding_high) { - llvm::SmallVector data; - auto low_values = edge_padding_low.getValues(); - auto high_values = edge_padding_high.getValues(); - for (int i = 0; i < edge_padding_low.getNumElements(); ++i) { - const int64_t pad_low = low_values[i].getSExtValue(); - const int64_t pad_high = high_values[i].getSExtValue(); - data.push_back(pad_low < 0 ? 0 : pad_low); - data.push_back(pad_high < 0 ? 0 : pad_high); - } - return data; -} - -template -void AddPadOpIfRequiredImpl(ConversionState& state, const Container& pad_data, - const Value padding_value) { - if (absl::c_any_of(pad_data, IsNot(0))) { - const TensorType output_type = state.ComputeResultTensorType( - [](int i, const auto& shape, const auto& pad) { - if (shape[i] < 0) { - return shape[i]; - } - return shape[i] + pad[2 * i] + pad[2 * i + 1]; - }, - pad_data); - - auto pad_tensor = AddConstantTensor( - state, pad_data, - {static_cast(state.GetOperandShape().size()), 2}); - auto tfl_pad = state.rewriter.create( - state.hlo_op->getLoc(), output_type, state.GetOperand(), pad_tensor, - padding_value); - - state.last_tf_op = tfl_pad; - } -} - -} // namespace - -void AddPadOpIfRequired(ConversionState& state, - const DenseElementsAttr& edge_padding_low, - const DenseElementsAttr& edge_padding_high, - const Value padding_value) { - AddPadOpIfRequiredImpl(state, GetPadData(edge_padding_low, edge_padding_high), - padding_value); -} - -namespace { - -// Holds the data needed to generate a TFLite StridedSlice operation. -struct StridedSliceData { - llvm::SmallVector low; - llvm::SmallVector high; - llvm::SmallVector strides; - int32_t begin_mask = 0; - int32_t end_mask = 0; - - void resize(const size_t size) { - low.resize(size); - high.resize(size); - strides.resize(size); - } -}; - -// Updates the strided slice data with the given values for the `i`th element. -// -// Warning: this expects the data internal buffers to have at least i+1 -// elements. 
-void AppendDataDim(StridedSliceData& data, const int i, const APInt& low, - const APInt& high, const APInt& stride) { - const int64_t pad_low = low.getSExtValue(); - const int64_t pad_high = high.getSExtValue(); - if (pad_low >= 0) { - data.begin_mask |= 1 << i; - data.low[i] = 0; - } else { - data.low[i] = -pad_low; - } - if (pad_high >= 0) { - data.end_mask |= 1 << i; - data.high[i] = 0; - } else { - data.high[i] = pad_high; - } - data.strides[i] = stride.getSExtValue(); -} - -// Gets the data needed to generate a TFLite StridedSlice operation. -StridedSliceData GetStridedSliceData(const DenseElementsAttr& edge_padding_low, - const DenseElementsAttr& edge_padding_high, - const DenseElementsAttr& strides) { - StridedSliceData data; - data.resize(edge_padding_low.getNumElements()); - const auto low_values = edge_padding_low.getValues(); - const auto high_values = edge_padding_high.getValues(); - const auto stride_values = strides.getValues(); - for (int i = 0; i < edge_padding_low.getNumElements(); ++i) { - AppendDataDim(data, i, low_values[i], high_values[i], stride_values[i]); - } - return data; -} - -void AddStridedSliceOpIfRequiredImpl( - ConversionState& state, const StridedSliceData& strided_slice_data) { - if (absl::c_any_of(strided_slice_data.low, IsNot(0)) || - absl::c_any_of(strided_slice_data.high, IsNot(0)) || - absl::c_any_of(strided_slice_data.strides, IsNot(1))) { - const TensorType output_type = state.ComputeResultTensorType( - [](int i, const auto& shape, const auto& high, const auto& low, - const auto& strides) { - if (shape[i] < 0) { - return shape[i]; - } - return (shape[i] + high[i] - low[i]) / strides[i]; - }, - strided_slice_data.high, strided_slice_data.low, - strided_slice_data.strides); - - auto crop_begin_tensor = AddConstantTensor(state, strided_slice_data.low); - auto crop_end_tensor = AddConstantTensor(state, strided_slice_data.high); - auto crop_strides_tensor = - AddConstantTensor(state, strided_slice_data.strides); - auto tfl_crop = state.rewriter.create( - state.hlo_op->getLoc(), output_type, state.GetOperand(), - crop_begin_tensor, crop_end_tensor, crop_strides_tensor, - strided_slice_data.begin_mask, strided_slice_data.end_mask, 0, 0, 0, - false); - - state.last_tf_op = tfl_crop; - } -} - -} // namespace - bool NeedsReformatTypeAndPermutation(int batch_dim, int feature_dim, int spatial_dim_start, int default_batch_dim, @@ -426,15 +222,6 @@ Value InsertTranspose(Value value, int batch_dim, int feature_dim, permutation); } -void AddStridedSliceOpIfRequired(ConversionState& state, - const DenseElementsAttr& edge_padding_low, - const DenseElementsAttr& edge_padding_high, - const DenseElementsAttr& strides) { - StridedSliceData strided_slice_data = - GetStridedSliceData(edge_padding_low, edge_padding_high, strides); - AddStridedSliceOpIfRequiredImpl(state, strided_slice_data); -} - Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { IntegerType new_ele_type = rewriter.getIntegerType(32); if (auto shaped_type = val.getType().dyn_cast()) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h index 442161ade171f7..c58fdaa76a78d2 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h @@ -139,137 +139,6 @@ LogicalResult MatchBinaryReduceFunction(mlir::Region& function) { template <> 
LogicalResult MatchBinaryReduceFunction(mlir::Region& function); -// Concentrates the data needed to substitute StableHLO operations with TFLite -// ones. -struct ConversionState { - Operation* hlo_op; - ConversionPatternRewriter& rewriter; - Operation* last_tf_op; - - // Returns the main operand of a NEW op to add to the conversion chain. - // - // This is generally the result of the last op that was added to the chain. - Value GetOperand() const; - - // Returns the type of the operand of a NEW op to add to the conversion chain. - // - // This is generally the type of the result of the last op that was added to - // the chain. - TensorType GetOperandTensorType() const; - - llvm::ArrayRef GetOperandShape() const; - - // Computes a new shape from the current operand shape. - // - // - The args are containers that are indexable using operator[]. - // - The callback must be callable have a signature that is: - // `int64_t (int idx, shape, decltype(args)...)` - // - // The callback is called for each element of the operand shape with the - // index of the current loop iteration, the shape and args. - template - llvm::SmallVector ComputeResultShape(F&& callback, - Containers&&... args) const { - llvm::ArrayRef shape = GetOperandShape(); - llvm::SmallVector res; - for (int i = 0; i < shape.size(); ++i) { - if (shape[i] < 0) { - res.push_back(shape[i]); - } else { - res.push_back(callback(i, shape, args...)); - } - } - return res; - } - - template - TensorType ComputeResultTensorType(F&& callback, Containers&&... args) const { - const llvm::SmallVector shape = ComputeResultShape( - static_cast(callback), static_cast(args)...); - return GetOperandTensorType().cloneWith( - shape, GetOperandTensorType().getElementType()); - } -}; - -// Gets the Type associated to type T from the builder. -template -Type GetElementType(OpBuilder& builder); - -#define GET_ELEMENT_TYPE_SPECIALISATION(TYPE, NAME) \ - template <> \ - inline Type GetElementType(OpBuilder & builder) { \ - return builder.get##NAME##Type(); \ - } - -GET_ELEMENT_TYPE_SPECIALISATION(int32_t, I32); -GET_ELEMENT_TYPE_SPECIALISATION(int64_t, I64); - -// Create a DenseElementsAttr from given shape and data. -template > -DenseElementsAttr CreateDenseElementsAttr(OpBuilder& builder, const Data& data, - const Shape& shape = Shape()) { - llvm::SmallVector attr_shape(shape.begin(), shape.end()); - if (attr_shape.empty()) { - attr_shape.push_back(static_cast(data.size())); - } - const Type attr_type = GetElementType(builder); - return DenseElementsAttr::get(RankedTensorType::get(attr_shape, attr_type), - ArrayRef(data)); -} - -// Adds a constant tensor to the conversion chain. -template > -auto AddConstantTensor(ConversionState& state, const Data& data, - const Shape& shape = Shape()) { - const DenseElementsAttr attr = - CreateDenseElementsAttr(state.rewriter, data, shape); - return state.rewriter.create(state.hlo_op->getLoc(), attr); -} - -// Builds a callable object that checks that its argument is not the given -// `value`. -template -auto IsNot(T value) { - return [value](auto v) { return v != value; }; -} - -// Adds a TFLite Dilate operation to the conversion chain. -// -// If the given parameters would end with the identity operation, this does not -// add anything to the chain. 
-// -// Depending on the definition of the op we are trying to legalize, a dilation -// can be either seen as interior padding or as a scaling factor where: -// -// scaling_factor = interior_padding + 1 -// -// The is_padding parameter is used to take this difference into account. -void AddDilateOpIfRequired(ConversionState& state, - const DenseElementsAttr& dilation, - Value padding_value, bool is_padding); - -// Adds a TFLite PadV2 operation to the conversion chain. -// -// If the given parameters would end with the identity operation, this does not -// add anything to the chain. -void AddPadOpIfRequired(ConversionState& state, - const DenseElementsAttr& edge_padding_low, - const DenseElementsAttr& edge_padding_high, - Value padding_value); - -// Adds a TFLite StridedSlice operation to the conversion chain. -// -// This overload is used to legalize a crop operation in TFLite. As such, the -// begin and end specifications of the strided slice are computed from the -// negative values in the padding parameters. -// -// If the given parameters would end with the identity operation, this does not -// add anything to the chain. -void AddStridedSliceOpIfRequired(ConversionState& state, - const DenseElementsAttr& edge_padding_low, - const DenseElementsAttr& edge_padding_high, - const DenseElementsAttr& strides); - // Util that casts 'val' to Int32 by adding a tfl cast Op. Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter); } // namespace odml diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc index 858fe15a7f492a..3bb9eddbfa5021 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc @@ -125,6 +125,16 @@ void StablehloToTflPass::runOnOperation() { continue; } + if (attr.isa<::mlir::DenseI64ArrayAttr>()) { + auto array_attr = attr.dyn_cast(); + auto start = fbb->StartVector(key); + for (auto int_value : array_attr.asArrayRef()) { + fbb->Add(int_value); + } + fbb->EndVector(start, /*typed=*/true, /*fixed=*/false); + continue; + } + if (attr.isa<::mlir::StringAttr>()) { fbb->String(key, attr.dyn_cast().data()); continue; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc index ec708f70724c84..5e4f79f18ce503 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc @@ -75,7 +75,8 @@ class TflToStablehloPass } llvm::SmallVector ReadAttr(const flexbuffers::Map& m, - Builder* builder) { + Builder* builder, + std::string op_name) { llvm::SmallVector attrs; const auto& keys = m.Keys(); for (size_t i = 0; i < keys.size(); ++i) { @@ -102,10 +103,19 @@ class TflToStablehloPass } else { shape.push_back(vec.size()); } - RankedTensorType ty = tensorflow::GetTypeFromTFTensorShape( - shape, builder->getIntegerType(64)); - auto named_attr = - builder->getNamedAttr(key, DenseIntElementsAttr::get(ty, vec)); + Attribute value; + if (op_name == "stablehlo.broadcast" || + op_name == "stablehlo.dynamic_slice" || + op_name == "stablehlo.fft" || op_name == "stablehlo.pad" || + op_name == "stablehlo.reverse" || op_name == "stablehlo.slice" || + op_name == "stablehlo.transpose") { + value = builder->getDenseI64ArrayAttr(vec); + } else { + RankedTensorType ty = 
tensorflow::GetTypeFromTFTensorShape( + shape, builder->getIntegerType(64)); + value = DenseIntElementsAttr::get(ty, vec); + } + auto named_attr = builder->getNamedAttr(key, value); attrs.push_back(named_attr); break; } @@ -181,7 +191,8 @@ void TflToStablehloPass::runOnOperation() { flexbuffers::GetRoot(option_buf, custom_op.getCustomOption().getValue().size()) .AsMap(); - auto attr = ReadAttr(flex_buffer_map, &builder); + auto attr = + ReadAttr(flex_buffer_map, &builder, custom_op.getCustomCode().str()); OperationState op_state(custom_op.getLoc(), custom_op.getCustomCode().str()); op_state.addOperands(custom_op.getOperands()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc index e50cb2dad9f4c0..6c07c0c0e4b8d2 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.h" // IWYU pragma: keep -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/pad.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/transforms/passes.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep @@ -57,7 +56,7 @@ void LegalizeHloToTfLitePass::runOnOperation() { MLIRContext& context = getContext(); RewritePatternSet patterns(&getContext()); // Add new conversion patterns here. - patterns.add(&context); + patterns.add(&context); populateWithGenerated(patterns); ConversionTarget target(context); @@ -66,8 +65,7 @@ void LegalizeHloToTfLitePass::runOnOperation() { target.addDynamicallyLegalOp(IsCustomCallLegal); // Converted MHLO ops should be marked illegal here. // TODO: b/304003568 - Add TF_TransposeOp folding logic to tflite. - target.addIllegalOp(); + target.addIllegalOp(); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { getOperation().emitError("mhlo to TFLite legalization failed."); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc index 18070fe59134e3..3ba5ad97ad579e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" #define DEBUG_TYPE "uniform-quantized-stablehlo-to-tfl" @@ -46,6 +47,13 @@ namespace mlir { namespace odml { namespace { +// TODO: b/311029361: Add e2e test for verifying this legalization once +// StableHLO Quantizer API migration is complete. 
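// The stablehlo_tfl_pass.cc hunk above writes a DenseI64ArrayAttr into a typed
// (non-fixed) FlexBuffer vector, and ReadAttr in tfl_stablehlo_pass.cc reads the
// custom-option buffer back as a map. A minimal standalone sketch of that
// FlexBuffer round trip, independent of MLIR; the key name and values are
// illustrative and the flatbuffers library is assumed to be on the include path.
#include <cstdint>
#include <iostream>
#include <vector>

#include "flatbuffers/flexbuffers.h"

int main() {
  flexbuffers::Builder fbb;
  const std::vector<int64_t> permutation = {2, 1, 0};  // illustrative values
  const size_t map_start = fbb.StartMap();
  // Mirrors fbb->StartVector(key) / fbb->Add(v) / fbb->EndVector(start, true, false).
  const size_t vec_start = fbb.StartVector("permutation");
  for (int64_t v : permutation) fbb.Add(v);
  fbb.EndVector(vec_start, /*typed=*/true, /*fixed=*/false);
  fbb.EndMap(map_start);
  fbb.Finish();

  // Read it back the way the custom-option buffer is consumed: root map, then
  // the keyed vector of integers.
  const std::vector<uint8_t>& buf = fbb.GetBuffer();
  auto map = flexbuffers::GetRoot(buf.data(), buf.size()).AsMap();
  auto vec = map["permutation"].AsTypedVector();
  for (size_t i = 0; i < vec.size(); ++i) std::cout << vec[i].AsInt64() << " ";
  std::cout << "\n";  // prints 2 1 0
  return 0;
}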
+ +using ::mlir::quant::IsI32F32UniformQuantizedType; +using ::mlir::quant::IsI8F32UniformQuantizedPerAxisType; +using ::mlir::quant::IsI8F32UniformQuantizedType; +using ::mlir::quant::IsSupportedByTfliteQuantizeOrDequantizeOps; using ::mlir::quant::QuantizedType; using ::mlir::quant::UniformQuantizedPerAxisType; using ::mlir::quant::UniformQuantizedType; @@ -60,95 +68,164 @@ class UniformQuantizedStablehloToTflPass void runOnOperation() override; }; -// Determines whether the storage type of a quantized type is supported by -// `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. -bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type) { - if ((storage_type.isSigned() && - !(storage_type.getWidth() == 8 || storage_type.getWidth() == 16)) || - (!storage_type.isSigned() && storage_type.getWidth() != 8)) { - LLVM_DEBUG(llvm::dbgs() - << "Uniform quantize / dequantize op only supports ui8, i8 or " - "i16 for the storage type of uniform quantized type. Got: " - << storage_type << ".\n"); - return false; - } - return true; -} - -// Returns true iff the storage type of `quantized_type` is 8-bit integer. -bool IsStorageTypeI8(QuantizedType quantized_type) { - const Type storage_type = quantized_type.getStorageType(); - return storage_type.isInteger(/*width=*/8); +// Bias scales for matmul-like ops should be input scale * filter scale. Here it +// is assumed that the input is per-tensor quantized and filter is per-channel +// quantized. +SmallVector GetBiasScales(const double input_scale, + const ArrayRef filter_scales) { + SmallVector bias_scales; + absl::c_transform(filter_scales, std::back_inserter(bias_scales), + [input_scale](const double filter_scale) -> double { + return filter_scale * input_scale; + }); + return bias_scales; } -// Returns true iff the expressed type of `quantized_type` is f32. -bool IsExpressedTypeF32(QuantizedType quantized_type) { - const Type expressed_type = quantized_type.getExpressedType(); - return expressed_type.isa(); +// Returns a bias scale for matmul-like ops. Here it is assumed that both input +// and filter are per-tensor quantized. +double GetBiasScale(const double input_scale, const double filter_scale) { + return filter_scale * input_scale; } -// Returns true iff `type` is a uniform quantized type whose storage type is -// 8-bit integer and expressed type is f32. -bool IsI8F32UniformQuantizedType(const Type type) { - auto quantized_type = type.dyn_cast_or_null(); - if (!quantized_type) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a uniform quantized type. Got: " << type << ".\n"); - return false; +// Creates a new `tfl.qconst` op for the quantized filter. Transposes the +// filter value from [i, o] -> [o, i]. This is because we assume `[i, o]` +// format for `stablehlo.dot_general` (i.e. contracting dimension == 1) +// whereas `tfl.fully_connected` accepts an OI format. +TFL::QConstOp CreateTflConstOpForFilter( + stablehlo::ConstantOp filter_constant_op, PatternRewriter& rewriter, + bool is_per_axis) { + const auto filter_values = filter_constant_op.getValue() + .cast() + .getValues(); + + ArrayRef filter_shape = + filter_constant_op.getType().cast().getShape(); + + // Reverse the shapes. This makes sense, assuming that the filter tensor has a + // rank of 2 (no batch dimension). + SmallVector new_filter_shape(filter_shape.rbegin(), + filter_shape.rend()); + + // Construct the value array of transposed filter. Assumes 2D matrix. 
+ SmallVector new_filter_values(filter_values.size(), /*Value=*/0); + for (int i = 0; i < filter_shape[0]; ++i) { + for (int j = 0; j < filter_shape[1]; ++j) { + const int old_idx = i * filter_shape[1] + j; + const int new_idx = j * filter_shape[0] + i; + new_filter_values[new_idx] = filter_values[old_idx]; + } } - if (!IsStorageTypeI8(quantized_type)) { - LLVM_DEBUG(llvm::dbgs() << "Expected an i8 storage type. Got: " - << quantized_type << ".\n"); - return false; - } + auto new_filter_value_attr_type = RankedTensorType::getChecked( + filter_constant_op.getLoc(), new_filter_shape, + /*elementType=*/rewriter.getI8Type()); - if (!IsExpressedTypeF32(quantized_type)) { - LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " - << quantized_type << ".\n"); - return false; + Type new_filter_quantized_type; + + if (is_per_axis) { + auto filter_quantized_type = filter_constant_op.getResult() + .getType() + .cast() + .getElementType() + .cast(); + + new_filter_quantized_type = UniformQuantizedPerAxisType::getChecked( + filter_constant_op.getLoc(), /*flags=*/true, + /*storageType=*/filter_quantized_type.getStorageType(), + /*expressedType=*/filter_quantized_type.getExpressedType(), + /*scales=*/filter_quantized_type.getScales(), + /*zeroPoints=*/filter_quantized_type.getZeroPoints(), + /*quantizedDimension=*/0, /*storageTypeMin=*/llvm::minIntN(8), + /*storageTypeMax=*/llvm::maxIntN(8)); + } else { + auto filter_quantized_type = filter_constant_op.getResult() + .getType() + .cast() + .getElementType() + .cast(); + new_filter_quantized_type = UniformQuantizedType::getChecked( + filter_constant_op.getLoc(), /*flags=*/true, + /*storageType=*/filter_quantized_type.getStorageType(), + /*expressedType=*/filter_quantized_type.getExpressedType(), + /*scale=*/filter_quantized_type.getScale(), + /*zeroPoint=*/filter_quantized_type.getZeroPoint(), + /*storageTypeMin=*/llvm::minIntN(8), + /*storageTypeMax=*/llvm::maxIntN(8)); } - return true; + // Required because the quantized dimension is changed from 3 -> 0. + auto new_filter_result_type = RankedTensorType::getChecked( + filter_constant_op.getLoc(), /*shape=*/new_filter_shape, + /*type=*/new_filter_quantized_type); + + auto new_filter_constant_value_attr = + DenseIntElementsAttr::get(new_filter_value_attr_type, new_filter_values); + return rewriter.create( + filter_constant_op.getLoc(), + /*output=*/TypeAttr::get(new_filter_result_type), + /*value=*/new_filter_constant_value_attr); } -// Returns true iff `type` is a uniform quantized per-axis (per-channel) type -// whose storage type is 8-bit integer and expressed type is f32. -bool IsI8F32UniformQuantizedPerAxisType(const Type type) { - auto quantized_per_axis_type = - type.dyn_cast_or_null(); - if (!quantized_per_axis_type) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a uniform quantized type. Got: " << type << ".\n"); - return false; - } +// Creates a new `tfl.qconst` op for the bias. The bias values are 0s, because +// this bias a dummy bias (note that bias fusion is not considered for this +// transformation). The quantization scale for the bias is input scale * +// filter scale. `filter_const_op` is used to retrieve the filter scales and +// the size of the bias constant. +// TODO - b/309896242: Support bias fusion legalization. 
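// The bias scales used by CreateTflConstOpForDummyBias below come from
// GetBiasScales / GetBiasScale above: each bias scale is input scale * filter
// scale, per output channel in the per-axis case. A minimal standalone sketch
// with plain std types and made-up scale values (the real values come from the
// uniform quantized element types).
#include <iostream>
#include <vector>

// Per-axis case, mirroring GetBiasScales: one bias scale per output channel.
std::vector<double> BiasScales(double input_scale,
                               const std::vector<double>& filter_scales) {
  std::vector<double> bias_scales;
  bias_scales.reserve(filter_scales.size());
  for (double filter_scale : filter_scales) {
    bias_scales.push_back(input_scale * filter_scale);
  }
  return bias_scales;
}

int main() {
  const double input_scale = 0.5;
  const std::vector<double> filter_scales = {0.02, 0.04, 0.08};
  for (double s : BiasScales(input_scale, filter_scales)) {
    std::cout << s << " ";  // prints 0.01 0.02 0.04
  }
  // Per-tensor case (GetBiasScale) is the same product with a single scale.
  std::cout << "\n" << input_scale * 0.02 << "\n";  // prints 0.01
  return 0;
}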
+TFL::QConstOp CreateTflConstOpForDummyBias(const Location loc, + const double input_scale, + TFL::QConstOp filter_const_op, + PatternRewriter& rewriter, + bool is_per_axis) { + const ArrayRef filter_shape = + filter_const_op.getResult().getType().getShape(); + + Type bias_quantized_type; + if (is_per_axis) { + const auto filter_quantized_element_type = + filter_const_op.getResult() + .getType() + .getElementType() + .cast(); - if (!IsStorageTypeI8(quantized_per_axis_type)) { - LLVM_DEBUG(llvm::dbgs() << "Expected an i8 storage type. Got: " - << quantized_per_axis_type << ".\n"); - return false; - } + // The storage type is i32 for bias, which is the precision used for + // accumulation. + bias_quantized_type = UniformQuantizedPerAxisType::getChecked( + loc, /*flags=*/true, /*storageType=*/rewriter.getI32Type(), + /*expressedType=*/rewriter.getF32Type(), /*scales=*/ + GetBiasScales(input_scale, filter_quantized_element_type.getScales()), + /*zeroPoints=*/filter_quantized_element_type.getZeroPoints(), + /*quantizedDimension=*/0, /*storageTypeMin=*/llvm::minIntN(8), + /*storageTypeMax=*/llvm::maxIntN(8)); + } else { + const auto filter_quantized_element_type = + filter_const_op.getResult() + .getType() + .getElementType() + .cast(); - if (!IsExpressedTypeF32(quantized_per_axis_type)) { - LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " - << quantized_per_axis_type << ".\n"); - return false; + // The storage type is i32 for bias, which is the precision used for + // accumulation. + bias_quantized_type = UniformQuantizedType::getChecked( + loc, /*flags=*/true, /*storageType=*/rewriter.getI32Type(), + /*expressedType=*/rewriter.getF32Type(), /*scale=*/ + GetBiasScale(input_scale, filter_quantized_element_type.getScale()), + /*zeroPoint=*/filter_quantized_element_type.getZeroPoint(), + /*storageTypeMin=*/llvm::minIntN(8), + /*storageTypeMax=*/llvm::maxIntN(8)); } - return true; -} + SmallVector bias_shape = {filter_shape[0]}; + auto bias_type = + RankedTensorType::getChecked(loc, bias_shape, bias_quantized_type); -// Bias scales for matmul-like ops should be input scale * filter scale. Here it -// is assumed that the input is per-tensor quantized and filter is per-channel -// quantized. -SmallVector GetBiasScales(const double input_scale, - const ArrayRef filter_scales) { - SmallVector bias_scales; - absl::c_transform(filter_scales, std::back_inserter(bias_scales), - [input_scale](const double filter_scale) -> double { - return filter_scale * input_scale; - }); - return bias_scales; + auto bias_value_type = RankedTensorType::getChecked( + loc, std::move(bias_shape), rewriter.getI32Type()); + auto bias_value = DenseIntElementsAttr::get( + bias_value_type, APInt(/*numBits=*/32, /*value=*/0, /*isSigned=*/true)); + + return rewriter.create( + loc, /*output=*/TypeAttr::get(bias_type), /*value=*/bias_value); } // stablehlo.uniform_quantize -> tfl.quantize @@ -163,10 +240,11 @@ class RewriteUniformQuantizeOp LogicalResult match(stablehlo::UniformQuantizeOp op) const override { const Type input_element_type = op.getOperand().getType().cast().getElementType(); - if (!input_element_type.isa()) { - LLVM_DEBUG(llvm::dbgs() - << "Uniform quantize op's input should be a float type. Got: " - << input_element_type << ".\n"); + if (!(input_element_type.isa() || + IsI32F32UniformQuantizedType(input_element_type))) { + LLVM_DEBUG(llvm::dbgs() << "Uniform quantize op's input should be a " + "float type or int32. 
Got: " + << input_element_type << ".\n"); return failure(); } @@ -257,7 +335,7 @@ class RewriteUniformDequantizeOp // * Not a depthwise convolution. // * Does not consider bias add fusion. // TODO: b/294771704 - Support bias quantization. -class RewriteQuantizedConvolutionOp +class RewriteUpstreamQuantizedConvolutionOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -654,7 +732,7 @@ class RewriteQuantizedConvolutionOp // // TODO: b/293650675 - Relax the conversion condition to support dot_general in // general. -class RewriteFullIntegerQuantizedDotGeneralOp +class RewriteUpstreamQuantizedDotGeneralOpToBatchMatmulOp : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -662,7 +740,7 @@ class RewriteFullIntegerQuantizedDotGeneralOp static LogicalResult MatchLhs( Value lhs, stablehlo::DotDimensionNumbersAttr dimension_numbers) { auto lhs_type = lhs.getType().cast(); - if (!(IsI8F32UniformQuantizedType(lhs_type.getElementType()))) { + if (!IsI8F32UniformQuantizedType(lhs_type.getElementType())) { LLVM_DEBUG(llvm::dbgs() << "Expected a per-tensor uniform " "quantized (i8->f32) input for dot_general. Got: " @@ -704,7 +782,7 @@ class RewriteFullIntegerQuantizedDotGeneralOp } auto rhs_type = rhs.getType().cast(); - if (!(IsI8F32UniformQuantizedType(rhs_type.getElementType()))) { + if (!IsI8F32UniformQuantizedType(rhs_type.getElementType())) { LLVM_DEBUG(llvm::dbgs() << "Expected a per-tensor uniform " "quantized (i8->f32) weight for dot_general. Got: " @@ -714,6 +792,19 @@ class RewriteFullIntegerQuantizedDotGeneralOp return success(); } + static LogicalResult MatchOutput( + Value output, stablehlo::DotDimensionNumbersAttr dimension_numbers) { + auto output_type = output.getType().cast(); + if (!IsI8F32UniformQuantizedType(output_type.getElementType())) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a per-tensor uniform " + "quantized (i8->f32) output for dot_general. Got: " + << output_type << "\n"); + return failure(); + } + return success(); + } + LogicalResult match(stablehlo::DotGeneralOp op) const override { stablehlo::DotDimensionNumbersAttr dimension_numbers = op.getDotDimensionNumbers(); @@ -746,6 +837,12 @@ class RewriteFullIntegerQuantizedDotGeneralOp return failure(); } + if (failed(MatchOutput(op.getResult(), dimension_numbers))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match output for quantized dot_general.\n"); + return failure(); + } + return success(); } @@ -819,7 +916,7 @@ class RewriteFullIntegerQuantizedDotGeneralOp // `RewriteFullIntegerQuantizedDotGeneralOp`. // TODO: b/295264927 - `stablehlo.dot_general` with per-axis quantized operands // is not specified in the StableHLO dialect. Update the spec to allow this. 
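// The fully-connected rewrites below rely on CreateTflConstOpForFilter (earlier
// in this file) to flip the 2-D filter from the [i, o] layout assumed for
// stablehlo.dot_general (contracting dimension == 1) to the [o, i] layout that
// tfl.fully_connected expects. A standalone sketch of that flat-index remapping
// over a small made-up matrix; sizes and values are illustrative only.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Filter in [i, o] layout: i (contracting) = 2 rows, o = 3 columns, row-major.
  const int64_t rows = 2, cols = 3;
  const std::vector<int8_t> filter = {1, 2, 3,
                                      4, 5, 6};

  // Same remapping as the loop in CreateTflConstOpForFilter:
  // element (i, j) moves from i * cols + j to j * rows + i.
  std::vector<int8_t> transposed(filter.size(), 0);
  for (int64_t i = 0; i < rows; ++i) {
    for (int64_t j = 0; j < cols; ++j) {
      transposed[j * rows + i] = filter[i * cols + j];
    }
  }

  // Prints 1 4 2 5 3 6, i.e. the same matrix in [o, i] (3x2) row-major layout.
  for (int8_t v : transposed) std::cout << static_cast<int>(v) << " ";
  std::cout << "\n";
  return 0;
}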
-class RewriteQuantizedDotGeneralOpToTflFullyConnectedOp +class RewriteUpstreamQuantizedDotGeneralOpToTflFullyConnectedOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -867,15 +964,17 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOp cast(op.getOperand(1).getDefiningOp()); TFL::QConstOp new_filter_constant_op = - CreateTflConstOpForFilter(filter_constant_op, rewriter); + CreateTflConstOpForFilter(filter_constant_op, rewriter, + /*is_per_axis=*/true); const Value input_value = op.getOperand(0); const double input_scale = input_value.getType() .cast() .getElementType() .cast() .getScale(); - TFL::QConstOp bias_constant_op = CreateTflConstOpForBias( - op.getLoc(), input_scale, new_filter_constant_op, rewriter); + TFL::QConstOp bias_constant_op = CreateTflConstOpForDummyBias( + op.getLoc(), input_scale, new_filter_constant_op, rewriter, + /*is_per_axis=*/true); const Value result_value = op.getResult(); // Set to `nullptr` because this attribute only matters when the input is @@ -962,106 +1061,208 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOp return success(); } +}; - // Creates a new `tfl.qconst` op for the quantized filter. Transposes the - // filter value from [i, o] -> [o, i]. This is because we assume `[i, o]` - // format for `stablehlo.dot_general` (i.e. contracting dimension == 1) - // whereas `tfl.fully_connected` accepts an OI format. - TFL::QConstOp CreateTflConstOpForFilter( - stablehlo::ConstantOp filter_constant_op, - PatternRewriter& rewriter) const { - const auto filter_values = filter_constant_op.getValue() - .cast() - .getValues(); +// Rewrites `stablehlo.dot_general` to `tfl.fully_connected` or +// `tfl.batch_matmul` when it accepts uniform quantized tensors. +// +// Conditions for `tfl.fully_connected` conversion: +// * Input and output tensors are per-tensor uniform quantized (i8->f32) +// tensors. +// * The filter tensor is constant a per-tensor uniform quantized (i8->f32) +// tensor. The quantization dimension should be 1 (the non-contracting +// dimension). +// * The input tensor's rank is either 2 or 3. The last dimension of the input +// tensor should be the contracting dimension, i.e. [..., c_x, r_x]. +// * The filter tensor's rank is 2. The contracting dimension should be the +// first dimension (dim 0), i.e. [c_y, r_y] where c_y == r_x. +// * Does not consider activation fusion. +// * Does not consider bias add fusion. +// TODO: b/580909703 - Include conversion conditions for `tfl.batch_matmul` op. +// +// TODO: b/295264927 - `stablehlo.dot_general` with per-axis quantized operands +// is not specified in the StableHLO dialect. Update the spec to allow this. +class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - ArrayRef filter_shape = - filter_constant_op.getType().cast().getShape(); + public: + LogicalResult match(stablehlo::DotGeneralOp op) const override { + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = + op.getDotDimensionNumbers(); + if (const int num_rhs_contracting_dims = + dot_dimension_nums.getRhsContractingDimensions().size(); + num_rhs_contracting_dims != 1) { + LLVM_DEBUG(llvm::dbgs() + << "Expected number of contracting dimensions to be 1. Got: " + << num_rhs_contracting_dims << ".\n"); + return failure(); + } - // Reverse the shapes. This makes sense because it assumes that the filter - // tensor has rank of 2 (no batch dimension). 
- SmallVector new_filter_shape(filter_shape.rbegin(), - filter_shape.rend()); + if (failed(MatchInput(op.getOperand(0)))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match input for quantized dot_general op.\n"); + return failure(); + } - // Construct the value array of transposed filter. Assumes 2D matrix. - SmallVector new_filter_values(filter_values.size(), /*Value=*/0); - for (int i = 0; i < filter_shape[0]; ++i) { - for (int j = 0; j < filter_shape[1]; ++j) { - const int old_idx = i * filter_shape[1] + j; - const int new_idx = j * filter_shape[0] + i; - new_filter_values[new_idx] = filter_values[old_idx]; - } + if (failed(MatchFilter(op.getOperand(1)))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match filter for quantized dot_general op.\n"); + return failure(); } - auto new_filter_value_attr_type = RankedTensorType::getChecked( - filter_constant_op.getLoc(), new_filter_shape, - /*elementType=*/rewriter.getI8Type()); + if (failed(MatchOutput(op.getResult()))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match output for quantized dot_general op.\n"); + return failure(); + } - auto filter_quantized_type = filter_constant_op.getResult() - .getType() - .cast() - .getElementType() - .cast(); + if (failed(MatchUsers(op.getResult()))) { + LLVM_DEBUG(llvm::dbgs() << "Failed to match subsequent requantize for " + "quantized dot_general op.\n"); + return failure(); + } - auto new_filter_quantized_type = UniformQuantizedPerAxisType::getChecked( - filter_constant_op.getLoc(), /*flags=*/true, - /*storageType=*/filter_quantized_type.getStorageType(), - /*expressedType=*/filter_quantized_type.getExpressedType(), - /*scales=*/filter_quantized_type.getScales(), - /*zeroPoints=*/filter_quantized_type.getZeroPoints(), - /*quantizedDimension=*/0, /*storageTypeMin=*/llvm::minIntN(8), - /*storageTypeMax=*/llvm::maxIntN(8)); + return success(); + } - // Required because the quantized dimension is changed from 3 -> 0. - auto new_filter_result_type = RankedTensorType::getChecked( - filter_constant_op.getLoc(), /*shape=*/new_filter_shape, - /*type=*/new_filter_quantized_type); + void rewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const override { + // Create the new filter constant - transpose filter value + // from [i, o] -> [o, i]. This is because we assume `[i, o]` format for + // `stablehlo.dot_general` (i.e. contracting dimension == 1) whereas + // `tfl.fully_connected` accepts an OI format. + auto filter_constant_op = + cast(op.getOperand(1).getDefiningOp()); - auto new_filter_constant_value_attr = DenseIntElementsAttr::get( - new_filter_value_attr_type, new_filter_values); - return rewriter.create( - filter_constant_op.getLoc(), - /*output=*/TypeAttr::get(new_filter_result_type), - /*value=*/new_filter_constant_value_attr); + TFL::QConstOp new_filter_constant_op = CreateTflConstOpForFilter( + filter_constant_op, rewriter, /*is_per_axis=*/false); + const Value input_value = op.getOperand(0); + const double input_scale = input_value.getType() + .cast() + .getElementType() + .cast() + .getScale(); + TFL::QConstOp bias_constant_op = CreateTflConstOpForDummyBias( + op.getLoc(), input_scale, new_filter_constant_op, rewriter, + /*is_per_axis=*/false); + + auto output_op = op.getResult().getDefiningOp(); + Operation* requantize_op = *output_op->getResult(0).getUsers().begin(); + Operation* dequantize_op = *requantize_op->getResult(0).getUsers().begin(); + + // Set to `nullptr` because this attribute only matters when the input is + // dynamic-range quantized. 
+ const BoolAttr asymmetric_quantize_inputs = nullptr; + auto tfl_fully_connected_op = rewriter.create( + op.getLoc(), + /*output=*/ + requantize_op->getResult(0).getType(), // result_value.getType(), + /*input=*/input_value, /*filter=*/new_filter_constant_op.getResult(), + /*bias=*/bias_constant_op.getResult(), + /*fused_activation_function=*/rewriter.getStringAttr("NONE"), + /*weights_format=*/rewriter.getStringAttr("DEFAULT"), + /*keep_num_dims=*/rewriter.getBoolAttr(false), + asymmetric_quantize_inputs); + + auto tfl_dequantize_op = rewriter.create( + op.getLoc(), dequantize_op->getResult(0).getType(), + tfl_fully_connected_op->getResult(0)); + + rewriter.replaceAllUsesWith(dequantize_op->getResult(0), + tfl_dequantize_op->getResult(0)); + + rewriter.replaceAllUsesWith(op.getResult(), + tfl_fully_connected_op.getResult(0)); + + rewriter.eraseOp(op); } - // Creates a new `tfl.qconst` op for the bias. The bias values are 0s, because - // this bias a dummy bias (note that bias fusion is not considered for this - // transformation). The quantization scale for the bias is input scale * - // filter scale. `filter_const_op` is used to retrieve the filter scales and - // the size of the bias constant. - TFL::QConstOp CreateTflConstOpForBias(const Location loc, - const double input_scale, - TFL::QConstOp filter_const_op, - PatternRewriter& rewriter) const { - const ArrayRef filter_shape = - filter_const_op.getResult().getType().getShape(); - const auto filter_quantized_element_type = - filter_const_op.getResult() - .getType() - .getElementType() - .cast(); + private: + static LogicalResult MatchInput(Value input) { + auto input_type = input.getType().cast(); + if (!input_type.hasRank() || + !(input_type.getRank() == 2 || input_type.getRank() == 3)) { + LLVM_DEBUG(llvm::dbgs() << "Input expected to have rank of 2 or 3. Got: " + << input_type << ".\n"); + return failure(); + } - // The storage type is i32 for bias, which is the precision used for - // accumulation. - auto bias_quantized_type = UniformQuantizedPerAxisType::getChecked( - loc, /*flags=*/true, /*storageType=*/rewriter.getI32Type(), - /*expressedType=*/rewriter.getF32Type(), /*scales=*/ - GetBiasScales(input_scale, filter_quantized_element_type.getScales()), - /*zeroPoints=*/filter_quantized_element_type.getZeroPoints(), - /*quantizedDimension=*/0, /*storageTypeMin=*/llvm::minIntN(8), - /*storageTypeMax=*/llvm::maxIntN(8)); + if (const auto input_element_type = input_type.getElementType(); + !IsI8F32UniformQuantizedType(input_element_type)) { + LLVM_DEBUG(llvm::dbgs() + << "Expected an i8->f32 uniform quantized type. Got: " + << input_element_type << ".\n"); + return failure(); + } - SmallVector bias_shape = {filter_shape[0]}; - auto bias_type = - RankedTensorType::getChecked(loc, bias_shape, bias_quantized_type); + return success(); + } - auto bias_value_type = RankedTensorType::getChecked( - loc, std::move(bias_shape), rewriter.getI32Type()); - auto bias_value = DenseIntElementsAttr::get( - bias_value_type, APInt(/*numBits=*/32, /*value=*/0, /*isSigned=*/true)); + static LogicalResult MatchFilter(Value filter) { + auto filter_type = filter.getType().cast(); + if (!filter_type.hasRank() || filter_type.getRank() != 2) { + LLVM_DEBUG(llvm::dbgs() + << "Filter tensor expected to have a tensor rank of 2. 
Got: " + << filter_type << ".\n"); + return failure(); + } - return rewriter.create( - loc, /*output=*/TypeAttr::get(bias_type), /*value=*/bias_value); + const Type filter_element_type = filter_type.getElementType(); + if (!IsI8F32UniformQuantizedType(filter_element_type)) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized (i8->f32) type. Got: " + << filter_element_type << "\n"); + return failure(); + } + + if (Operation* filter_op = filter.getDefiningOp(); + filter_op == nullptr || !isa(filter_op)) { + LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); + return failure(); + } + + return success(); + } + + static LogicalResult MatchOutput(Value output) { + const Type output_element_type = + output.getType().cast().getElementType(); + if (!IsI32F32UniformQuantizedType(output_element_type)) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized (i32->f32) type. Got: " + << output_element_type << ".\n"); + return failure(); + } + return success(); + } + + static LogicalResult MatchUsers(Value output) { + auto output_op = output.getDefiningOp(); + + if (!output_op->hasOneUse()) { + LLVM_DEBUG(llvm::dbgs() << "Expected output to be used only once.\n"); + return failure(); + } + // TODO: b/309896242 - Add support for fused op case. + if (Operation* requantize_op = dyn_cast_or_null( + *output_op->getResult(0).getUsers().begin())) { + const Type requantize_element_type = requantize_op->getResult(0) + .getType() + .cast() + .getElementType(); + if (!IsI8F32UniformQuantizedType(requantize_element_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected a quantize (i8->f32) type. Got: " + << requantize_element_type << ".\n"); + return failure(); + } + if (!isa( + *requantize_op->getResult(0).getUsers().begin())) { + LLVM_DEBUG(llvm::dbgs() << "Expected a dequantize type.\n"); + return failure(); + } + } + return success(); } }; @@ -1071,9 +1272,11 @@ void UniformQuantizedStablehloToTflPass::runOnOperation() { RewritePatternSet patterns(&ctx); patterns.add(&ctx); + RewriteUpstreamQuantizedConvolutionOp, + RewriteUpstreamQuantizedDotGeneralOpToBatchMatmulOp, + RewriteUpstreamQuantizedDotGeneralOpToTflFullyConnectedOp, + RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp>( + &ctx); if (failed(applyPatternsAndFoldGreedily(func_op, std::move(patterns)))) { func_op.emitError() << "Failed to convert stablehlo ops with uniform " diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index 17b724051cdede..77f634edb94768 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -344,3 +344,32 @@ func.func @trivial_dynamic_update_slice_wrong_update_shape(%arg0: tensor<2x7x14x // CHECK: "tfl.dynamic_update_slice" func.return %1 : tensor<2x7x14xf32> } + +// CHECK-LABEL: OptimizeTranposeWithRank7orMoreEffectiveRank6 +func.func @OptimizeTranposeWithRank7orMoreEffectiveRank6(%arg0: tensor<7x6x5x4x3x2x1xf32> ) -> (tensor<1x2x3x4x5x6x7xf32>) { + %cst = arith.constant dense<[6, 5, 4, 3, 2, 1, 0]> : tensor<7xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<7x6x5x4x3x2x1xf32>, tensor<7xi32>) -> tensor<1x2x3x4x5x6x7xf32> + return %0 : tensor<1x2x3x4x5x6x7xf32> + // CHECK-DAG: %cst = arith.constant dense<[7, 6, 5, 4, 3, 2]> : tensor<6xi32> + // CHECK-DAG: %cst_0 = arith.constant dense<[5, 4, 3, 2, 1, 0]> : tensor<6xi32> + // CHECK-DAG: %cst_1 = arith.constant dense<[1, 2, 3, 4, 5, 6, 7]> : tensor<7xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst) : 
(tensor<7x6x5x4x3x2x1xf32>, tensor<6xi32>) -> tensor<7x6x5x4x3x2xf32> + // CHECK: %1 = "tfl.transpose"(%0, %cst_0) : (tensor<7x6x5x4x3x2xf32>, tensor<6xi32>) -> tensor<2x3x4x5x6x7xf32> + // CHECK: %2 = "tfl.reshape"(%1, %cst_1) : (tensor<2x3x4x5x6x7xf32>, tensor<7xi32>) -> tensor<1x2x3x4x5x6x7xf32> + // CHECK: return %2 +} + +// CHECK-LABEL: OptimizeTranposeWithRank7orMoreEffectiveRank4 +func.func @OptimizeTranposeWithRank7orMoreEffectiveRank4(%arg0: tensor<56x8x56x1x1x1x7xf32> ) -> (tensor<1x1x8x56x56x7x1xf32>) { + %cst = arith.constant dense<[4, 5, 1, 2, 0, 6, 3]> : tensor<7xi32> + %0 = "tfl.transpose"(%arg0, %cst) : (tensor<56x8x56x1x1x1x7xf32>, tensor<7xi32>) -> tensor<1x1x8x56x56x7x1xf32> + return %0 : tensor<1x1x8x56x56x7x1xf32> + // CHECK-DAG: %cst = arith.constant dense<[56, 8, 56, 7]> : tensor<4xi32> + // CHECK-DAG: %cst_0 = arith.constant dense<[1, 2, 0, 3]> : tensor<4xi32> + // CHECK-DAG: %cst_1 = arith.constant dense<[1, 1, 8, 56, 56, 7, 1]> : tensor<7xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<56x8x56x1x1x1x7xf32>, tensor<4xi32>) -> tensor<56x8x56x7xf32> + // CHECK: %1 = "tfl.transpose"(%0, %cst_0) : (tensor<56x8x56x7xf32>, tensor<4xi32>) -> tensor<8x56x56x7xf32> + // CHECK: %2 = "tfl.reshape"(%1, %cst_1) : (tensor<8x56x56x7xf32>, tensor<7xi32>) -> tensor<1x1x8x56x56x7x1xf32> + // CHECK: return %2 +} + diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir index 97e3a647b042a6..33e5cca6e5de17 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir @@ -8,8 +8,8 @@ func.func @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x func.return %24 : tensor<1x4xf32> // CHECK-LABEL: main // separate lines since there is no region for this op. 
third_party/tensorflow/compiler/mlir/lite/ir/tfl_ops.td: 3252 -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> // CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ // CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] @@ -46,8 +46,8 @@ func.func @testLSTMAsymAttributeTrue(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {asymmetric_quantize_inputs = true, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> func.return %24 : tensor<1x4xf32> -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> // CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ // CHECK: }) {asymmetric_quantize_inputs = true, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, 
tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] @@ -63,8 +63,8 @@ func.func @testLSTMAsymAttributeFalse(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4x %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> func.return %24 : tensor<1x4xf32> -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> // CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ // CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] @@ -80,8 +80,8 @@ func.func @testLSTMAsymAttributeDefault(%arg0: tensor<1x4xf32>, %arg1: tensor<4x %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> func.return %24 : tensor<1x4xf32> -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> +// 
CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> // CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ // CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/metadata_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/metadata_buffer.mlir deleted file mode 100644 index 6b76b31c9a52bf..00000000000000 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/metadata_buffer.mlir +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s - -// CHECK: tfl.metadata_buffer = [3 : i32, 7 : i32] -module attributes {tfl.metadata_buffer = [3 : i32, 7 : i32]} { - func.func @main(%arg0: tensor, %arg1: tensor<3x2xi32>) -> tensor<3x2xi32> { - %0 = "tfl.add" (%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor, tensor<3x2xi32>) -> tensor<3x2xi32> - func.return %0 : tensor<3x2xi32> - } -} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir index 64567f5c3d5d68..708dd562195398 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir @@ -320,7 +320,7 @@ func.func @dynamic_update_slice(%arg0: tensor<4x4xi64>, %arg1: tensor<2x3xi64>, func.func @dyanmic_slice(%arg0: tensor<3x3xi64>, %arg1: tensor, %arg2: tensor) -> tensor<3x3xi64> { %0 = "stablehlo.dynamic_slice"(%arg0, %arg1, %arg2) { - slice_sizes = dense<[3, 3]> : tensor<2xi64> + slice_sizes = array } : (tensor<3x3xi64>, tensor, tensor) -> tensor<3x3xi64> return %0 : tensor<3x3xi64> } @@ -524,7 +524,7 @@ func.func @gather(%operand: tensor<3x4x2xi32>, %start_indices: tensor<2x3x2xi64> // CHECK-NEXT:} func.func @transpose(%arg0: tensor<2x3x2xi32>) -> tensor<2x3x2xi32> { - %0 = "stablehlo.transpose"(%arg0) {permutation = dense<[2, 1, 0]> : tensor<3xi64>} : (tensor<2x3x2xi32>) -> tensor<2x3x2xi32> + %0 = "stablehlo.transpose"(%arg0) {permutation = array} : (tensor<2x3x2xi32>) -> tensor<2x3x2xi32> return %0 : tensor<2x3x2xi32> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir new file mode 100644 index 00000000000000..0914fc37016771 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir @@ -0,0 +1,8 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s + +// CHECK-LABEL: main +func.func @main() -> tensor<3x2xi32> { + // CHECK: "tfl.pseudo_const"() 
{tfl.is_variable, value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> + %0 = "tfl.pseudo_const"() {value = dense<0> : tensor<3x2xi32>, tfl.is_variable} : () -> tensor<3x2xi32> loc("variable") + func.return %0 : tensor<3x2xi32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/deduplicate_const.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/deduplicate_const.mlir new file mode 100644 index 00000000000000..c0c1bf4f70dc7f --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/deduplicate_const.mlir @@ -0,0 +1,93 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +module { +func.func @add(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> attributes {tf.entry_function = {inputs = "serving_default_x", outputs = "outputs"}} { + %0 = "tfl.pseudo_const" () {value = dense<[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]> : tensor<3x2xf32>} : () -> tensor<3x2xf32> + %1 = "tfl.add" (%0, %arg0) {fused_activation_function = "NONE"} : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> + func.return %1 : tensor<3x2xf32> +} + +func.func @sub(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> attributes {tf.entry_function = {inputs = "serving_default_x", outputs = "outputs"}} { + %0 = "tfl.pseudo_const" () {value = dense<[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]> : tensor<3x2xf32>} : () -> tensor<3x2xf32> + %1 = "tfl.sub" (%0, %arg0) {fused_activation_function = "NONE"} : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> + func.return %1 : tensor<3x2xf32> +} +} + +// CHECK: { +// CHECK: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "serving_default_x", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_const", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "outputs", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: } ], +// CHECK: name: "add" +// CHECK-NEXT: }, { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "serving_default_x", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_const1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "outputs", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0 ], +// CHECK-NEXT: outputs: [ 2 ], +// CHECK: name: "sub" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128, 64, 0, 0, 160, 64, 0, 0, 192, 64 ] +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { 
+// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 54, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK: } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir index c1504b979afa5b..3bab94ea490f37 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -273,7 +273,7 @@ func.func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf3 // CHECK-NEXT: }, { // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_asym_attr.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_asym_attr.mlir index 62cf9336ad0f29..6e5a70a6b5bb66 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_asym_attr.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_asym_attr.mlir @@ -273,7 +273,7 @@ func.func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf3 // CHECK-NEXT: }, { // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata_buffer.mlir deleted file mode 100644 index f53f3954f14211..00000000000000 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata_buffer.mlir +++ /dev/null @@ -1,11 +0,0 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s - -module attributes {tfl.metadata_buffer = [3 : i32, 7 : i32]} { - func.func @main(%arg0: tensor, %arg1: tensor<3x2xi32>) -> tensor<3x2xi32> { - %0 = "tfl.add" (%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor, tensor<3x2xi32>) -> tensor<3x2xi32> - func.return %0 : tensor<3x2xi32> - } -} - -// CHECK: metadata_buffer: [ 3, 7 ], -// CHECK-NEXT: metadata: \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir index 8253e8215f9d38..e5c9b4802c15e4 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir @@ -44,7 +44,7 @@ // CHECK-NEXT: has_rank: true // CHECK-NEXT: }, { // CHECK-NEXT: shape: [ 5, 384 ], -// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: buffer: 4, // CHECK-NEXT: name: "arith.constant2", // CHECK-NEXT: quantization: { // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/u16_quant.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/u16_quant.mlir new file mode 100644 index 00000000000000..251e8bd389cfd1 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/u16_quant.mlir @@ -0,0 +1,19 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func.func @main(%arg0: tensor<*x!quant.uniform>) -> tensor<*x!quant.uniform> { +// CHECK: { +// CHECK-NEXT: version: 3, +// 
CHECK-NEXT: operator_codes: [ ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ ], +// CHECK-NEXT: type: UINT16, +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "arg0", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 2.0 ], +// CHECK-NEXT: zero_point: [ 37 ] +// CHECK: } +// CHECK-NEXT: } ], + return %arg0 : tensor<*x!quant.uniform> +} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir index 669f6068e948b5..738b413c09268b 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s --dump-input=always +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s func.func @main(tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> { // CHECK: { @@ -298,7 +298,7 @@ func.func @main(tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4x // CHECK-NEXT: }, { // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/variable.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/variable.mlir new file mode 100644 index 00000000000000..2b393f7fecaa8d --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/variable.mlir @@ -0,0 +1,40 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func.func @main() -> tensor<3x2xi32> { + %0 = "tfl.pseudo_const" () {value = dense<0> : tensor<3x2xi32>, tfl.is_variable} : () -> tensor<3x2xi32> loc("variable") + func.return %0 : tensor<3x2xi32> +} + +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 3, 2 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: name: "variable", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: has_rank: true +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ ], +// CHECK-NEXT: outputs: [ 0 ], +// CHECK-NEXT: operators: [ ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ {{.*}} ] +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ {{.*}} ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", 
+// CHECK-NEXT: buffer: 2 +// CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] +// CHECK-NEXT: } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index fff00820ce353b..0769e768507ee7 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -226,7 +226,7 @@ func.func @matmulNoTransposeAOrB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1 // CHECK-LABEL: matmulNoTransposeAOrB // CHECK: %[[RES:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor // CHECK: %[[TRANS:.*]] = "tf.Transpose"(%arg1, %[[RES]]) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> - // CHECK: %[[MM:.*]] = "tf.MatMul"(%arg0, %[[TRANS]]) <{transpose_a = false, transpose_b = true}> : (tensor<1x1280xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: %[[MM:.*]] = "tf.MatMul"(%arg0, %[[TRANS]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = true}> : (tensor<1x1280xf32>, tensor<*xf32>) -> tensor<1x1000xf32> // CHECK: return %[[MM]] : tensor<1x1000xf32> } @@ -238,7 +238,7 @@ func.func @matmulNoTransposeB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000 // CHECK: %[[RES:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor // CHECK: %[[TRANS1:.*]] = "tf.Transpose"(%arg0, %[[RES]]) : (tensor<1x1280xf32>, tensor) -> tensor<*xf32> // CHECK: %[[TRANS2:.*]] = "tf.Transpose"(%arg1, %[[RES]]) : (tensor<1280x1000xf32>, tensor) -> tensor<*xf32> - // CHECK: %[[MM:.*]] = "tf.MatMul"(%[[TRANS1]], %[[TRANS2]]) <{transpose_a = false, transpose_b = true}> : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> + // CHECK: %[[MM:.*]] = "tf.MatMul"(%[[TRANS1]], %[[TRANS2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = true}> : (tensor<*xf32>, tensor<*xf32>) -> tensor<1x1000xf32> // CHECK: return %[[MM]] : tensor<1x1000xf32> } @@ -718,7 +718,7 @@ func.func @QuantDequantTranspose(%arg0: tensor<2x3xf32>) -> (tensor<2x4xf32>) { // CHECK: %[[QUANT:.*]] = "tfl.quantize"(%[[CST_0]]) {qtype = tensor<3x4x!quant.uniform>} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = "tfl.dequantize"(%[[QUANT]]) : (tensor<3x4x!quant.uniform>) -> tensor<3x4xf32> // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[DEQUANT]], %[[CST]]) : (tensor<3x4xf32>, tensor) -> tensor<*xf32> - // CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[TRANSPOSE]]) <{transpose_a = false, transpose_b = true}> : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<2x4xf32> + // CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[TRANSPOSE]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = true}> : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<2x4xf32> // CHECK: return %[[MATMUL]] : tensor<2x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index ec970065be3576..8a3abc94e2af57 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -407,7 +407,7 @@ void DenseToSparsePass::runOnOperation() { } if (result.needs_densify) { - const auto value = op->getOperand(operand); + auto value = op->getOperand(operand); auto densify = builder.create(op->getLoc(), value.getType(), value); value.replaceAllUsesWith(densify); diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td 
b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index a99d4a9a1c688e..a2ea10fe199736 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -387,13 +387,13 @@ def LegalizeDiv : Pat<(TF_DivOp $lhs, $rhs), // fall through to here and convert to TF Lite BatchMatMul. // TODO(b/207064634): CreateEmptyBoolAttr is a temporary workaround for this bug. def LegalizeBatchMatMulV3UnknownBatch : Pat< - (TF_BatchMatMulV3Op $lhs, $rhs, $adj_x, $adj_y), + (TF_BatchMatMulV3Op $lhs, $rhs, $adj_x, $adj_y, $grad_x, $grad_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y, CreateEmptyBoolAttr:$adj_y)>; def LegalizeBatchMatMulV2UnknownBatch : Pat< - (TF_BatchMatMulV2Op $lhs, $rhs, $adj_x, $adj_y), + (TF_BatchMatMulV2Op $lhs, $rhs, $adj_x, $adj_y, $grad_x, $grad_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y, CreateEmptyBoolAttr:$adj_y)>; def LegalizeBatchMatMulUnknownBatch : Pat< - (TF_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y), + (TF_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y, $grad_x, $grad_y), (TFL_BatchMatMulOp $lhs, $rhs, $adj_x, $adj_y, CreateEmptyBoolAttr:$adj_y)>; def LegalizeFakeQuantWithMinMaxVars: Pat< diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 64bff681053f6e..2ed3c0519d8526 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -362,34 +362,6 @@ TypeAttr RescaleQtype(Type input, Attribute factor) { return quant::RescaleQuantizedType(input, factor); } -// Returns shape of a ranked tensor. -// Precondition: output_val's is ranked tensor. -// Returns a truncated shape when `truncate` is set to true. -DenseElementsAttr GetShape(Value output_val, bool truncate = false) { - auto output_shape = output_val.getType().dyn_cast().getShape(); - - SmallVector shape; - shape.reserve(output_shape.size()); - - bool needs_truncation = true; - for (size_t dim_idx = 0; dim_idx < output_shape.size(); ++dim_idx) { - int64_t dim = output_shape[dim_idx]; - if (truncate && needs_truncation && dim == 1) { - continue; - } else if (needs_truncation && dim != 1) { - needs_truncation = false; - } - shape.push_back(ShapedType::isDynamic(dim) ? -1 - : static_cast(dim)); - } - - return mlir::DenseElementsAttr::get( - RankedTensorType::get( - {static_cast(shape.size())}, - mlir::IntegerType::get(output_val.getContext(), 32)), - llvm::ArrayRef(shape)); -} - // Utility function to map final permutation to initial permutation // initial -> permutation1 -> permutation2 -> final DenseElementsAttr RemapPermutation(Value permutation1, Value permutation2) { diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index dccf57ab1ecf40..e1e3d766ed3e5b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -41,11 +41,6 @@ class HasRankAtMost : Constraint< CPred<"$0.getType().cast().hasRank() && " "$0.getType().cast().getRank() <= " # n>>; -// Checks if the value has rank at most 'n'. -class HasRankAtLeast : Constraint< - CPred<"$0.getType().cast().hasRank() && " - "$0.getType().cast().getRank() >= " # n>>; - // Checks if the value has rank 'n'. 
class HasRank : Constraint< CPred<"$0.getType().cast().hasRank() && " @@ -698,14 +693,10 @@ foreach ValueOp = [TFL_CeilOp, TFL_ExpOp, TFL_FloorOp, TFL_NegOp, } } -// Returns shape of a ranked tensor. -// if called without a ranked tensor it will fail. -def GetShape: NativeCodeCall<"GetShape($0)">; - // Returns truncated shape of a ranked-tensor. -// Truncated, here, means eliminating any contiguous 1s' in the lower +// Prefix-Truncated, here, means eliminating any contiguous 1s' in the lower // dimentions of the tensor -def GetTruncatedShape: NativeCodeCall<"GetShape($0, true)">; +def GetPrefixTruncatedShape: NativeCodeCall<"GetShape($0, true)">; // Returns True if the operand type is RankedTensorType and valid. def HasValidRankedTensor : Constraint().getNumDynamicDims() <= 1">>; // Check if the truncated shape of the lhs is equal to the shape of rhs -def IsTruncatedShapeEqualTo : Constraint>; def ConvertSqueezeToReshape : Pat< @@ -735,9 +726,9 @@ def ConvertTrasposeReshapeTransposeToReshape : Pat< (TFL_TransposeOp:$first_transpose $input, $permutation2), $shape), $permutation1), - (TFL_ReshapeOp $input, (Arith_ConstantOp (GetTruncatedShape $input))), - [(IsTruncatedShapeEqualTo $first_transpose, $middle_reshape), - (IsTruncatedShapeEqualTo $input, $second_transpose)]>; + (TFL_ReshapeOp $input, (Arith_ConstantOp (GetPrefixTruncatedShape $input))), + [(IsPrefixTruncatedShapeEqualTo $first_transpose, $middle_reshape), + (IsPrefixTruncatedShapeEqualTo $input, $second_transpose)]>; // TODO(b/294385379): This pattern only appears when we convert // from shlo due to differences in broadcasting behavior diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index abd57fe7372ef8..c625b329be6413 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -51,21 +51,21 @@ class TFi32 : ConstantAttr(v)>; // Matmul without transpose on b to matmul with explicit transpose op and // transposed b. def ConvertMatmulWithoutTransposeToWithTranspose : - Pat<(TF_MatMulOp $a, $b, ConstBoolAttrFalse:$at, ConstBoolAttrFalse), + Pat<(TF_MatMulOp $a, $b, ConstBoolAttrFalse:$at, ConstBoolAttrFalse, $grad_a, $grad_b), (TF_MatMulOp $a, (TF_TransposeOp $b, (TF_SubOp (TF_RangeOp /*start=*/(TF_RankOp $b), /*limit=*/(TF_ConstOp TFi32<0>), /*delta=*/(TF_ConstOp TFi32<-1>)), (TF_ConstOp TFi32<1>))), - $at, ConstBoolAttrTrue)>; + $at, ConstBoolAttrTrue, $grad_a, $grad_b)>; // Matmul with transpose on a to matmul with explicit transpose op and a not // transposed. 
-def ConvertMatmulWithTranspose : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrTrue, $bt), +def ConvertMatmulWithTranspose : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrTrue, $bt, $grad_a, $grad_b), (TF_MatMulOp (TF_TransposeOp $a, (TF_SubOp (TF_RangeOp /*start=*/(TF_RankOp $a), /*limit=*/(TF_ConstOp TFi32<0>), /*delta=*/(TF_ConstOp TFi32<-1>)), (TF_ConstOp TFi32<1>))), $b, - ConstBoolAttrFalse, $bt)>; + ConstBoolAttrFalse, $bt, $grad_a, $grad_b)>; // Partially supported in TFLite, treated as passthrough IdentityOp def ConvertCheckNumerics : Pat<(TF_CheckNumericsOp $arg, $msg), (TF_IdentityOp $arg)>; diff --git a/tensorflow/compiler/mlir/lite/utils/utils.h b/tensorflow/compiler/mlir/lite/utils/utils.h index 9fe43f34b256cf..6130bab6531ba2 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.h +++ b/tensorflow/compiler/mlir/lite/utils/utils.h @@ -107,6 +107,34 @@ inline ShapedType GetTransposedType(Value input, return transposed_type; } +// Returns shape of a ranked tensor. +// Precondition: output_val's is ranked tensor. +// Returns a truncated shape when `truncate` is set to true. +inline DenseElementsAttr GetShape(Value output_val, bool truncate = false) { + auto output_shape = output_val.getType().dyn_cast().getShape(); + + SmallVector shape; + shape.reserve(output_shape.size()); + + bool needs_truncation = true; + for (size_t dim_idx = 0; dim_idx < output_shape.size(); ++dim_idx) { + int64_t dim = output_shape[dim_idx]; + if (truncate && needs_truncation && dim == 1) { + continue; + } else if (needs_truncation && dim != 1) { + needs_truncation = false; + } + shape.push_back(ShapedType::isDynamic(dim) ? -1 + : static_cast(dim)); + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(shape.size())}, + mlir::IntegerType::get(output_val.getContext(), 32)), + llvm::ArrayRef(shape)); +} + } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td index c2b953d2cf0585..e64b591ae78eda 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.td +++ b/tensorflow/compiler/mlir/lite/utils/utils.td @@ -17,6 +17,16 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" +include "mlir/IR/PatternBase.td" + +// Returns shape of a ranked tensor. +// if called without a ranked tensor it will fail. +def GetShape: NativeCodeCall<"GetShape($0)">; + +// Checks if the value has rank at most 'n'. +class HasRankAtLeast : Constraint< + CPred<"$0.getType().cast().hasRank() && " + "$0.getType().cast().getRank() >= " # n>>; // Checks value is not produced by a TFL_Quant or // from TFL_Quant Op with same quant type. 
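Aside on the `GetShape` helper that the patch moves into `utils.h` above: when called with `truncate = true` (as the renamed `GetPrefixTruncatedShape` TableGen hook does), it drops only the leading run of size-1 dimensions and maps dynamic dimensions to -1. The following is a minimal, self-contained sketch of that prefix-truncation logic over plain C++ containers rather than MLIR types; `PrefixTruncatedShape` is a hypothetical name used only for illustration, not part of the patch.

#include <cstdint>
#include <vector>

// Simplified model of GetShape(output_val, /*truncate=*/true): skip the
// leading run of unit dimensions, then copy the remaining extents verbatim.
// Dynamic dimensions are modelled here as negative extents and emitted as -1,
// mirroring the ShapedType::isDynamic handling in the real helper.
std::vector<int32_t> PrefixTruncatedShape(const std::vector<int64_t>& dims) {
  std::vector<int32_t> shape;
  bool still_truncating = true;
  for (int64_t dim : dims) {
    if (still_truncating && dim == 1) continue;  // Drop a leading 1.
    still_truncating = false;  // First non-1 extent ends the truncation.
    shape.push_back(dim < 0 ? -1 : static_cast<int32_t>(dim));
  }
  return shape;
}

// PrefixTruncatedShape({1, 1, 3, 1, 2}) -> {3, 1, 2}  (interior 1s are kept)
// PrefixTruncatedShape({4, 1, 5})       -> {4, 1, 5}  (nothing to truncate)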
diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index acc7bd1a8fb01e..afc088517dc35f 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -1,3 +1,4 @@ +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") load("//tensorflow:tensorflow.default.bzl", "filegroup") load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") @@ -7,6 +8,17 @@ package( licenses = ["notice"], ) +bool_flag( + name = "disable_mlir", + build_setting_default = False, +) + +config_setting( + name = "disable_mlir_config", + flag_values = {":disable_mlir": "True"}, + visibility = ["//visibility:public"], +) + cc_library( name = "mlir", srcs = ["mlir.cc"], diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc index 6042a896709d9e..8c82fc9bc12b42 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc @@ -29,8 +29,7 @@ PYBIND11_MODULE(filecheck_wrapper, m) { llvm::SMLoc()); SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(check), llvm::SMLoc()); - llvm::Regex regex = fc.buildCheckPrefixRegex(); - fc.readCheckFile(SM, llvm::StringRef(check), regex); + fc.readCheckFile(SM, llvm::StringRef(check)); return fc.checkInput(SM, llvm::StringRef(input)); }); } diff --git a/tensorflow/compiler/mlir/quantization/common/BUILD b/tensorflow/compiler/mlir/quantization/common/BUILD new file mode 100644 index 00000000000000..a39f7e5a64d268 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/BUILD @@ -0,0 +1,123 @@ +load("@llvm-project//mlir:tblgen.bzl", "td_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + # By default, these targets should only be used within the quantization library. 
+ default_visibility = [ + "//learning/brain/mlir/quantization:__subpackages__", + "//tensorflow/compiler/mlir/quantization:__subpackages__", + ], + licenses = ["notice"], +) + +td_library( + name = "lift_as_function_call_td_files", + srcs = [ + "lift_as_function_call.td", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "@llvm-project//mlir:FuncTdFiles", + ], +) + +cc_library( + name = "lift_as_function_call", + srcs = ["lift_as_function_call.cc"], + hdrs = ["lift_as_function_call.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/stablehlo:stablehlo_type_utils", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "//tensorflow/core:framework_lite", + "//tensorflow/core/ir/types:Dialect", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "lift_as_function_call_test", + srcs = ["lift_as_function_call_test.cc"], + deps = [ + ":lift_as_function_call", + ":test_base", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", + ], +) + +cc_library( + name = "test_base", + testonly = 1, + srcs = [], + hdrs = ["test_base.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/core:test", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:QuantOps", + "@stablehlo//:stablehlo_ops", + ], +) + +cc_library( + name = "attrs_and_constraints", + srcs = [ + "attrs_and_constraints.cc", + ], + hdrs = [ + "attrs_and_constraints.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +td_library( + name = "quant_td_files", + srcs = [ + "attrs_and_constraints.td", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops_td_files", + "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call_td_files", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "@llvm-project//mlir:ArithOpsTdFiles", + "@llvm-project//mlir:FuncTdFiles", + ], +) diff --git 
a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc similarity index 79% rename from tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc rename to tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc index a72098d3fa8aae..a5d4f745a7d02a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc @@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" - -#include +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "llvm/ADT/STLExtras.h" +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -namespace mlir { -namespace quant { +namespace mlir::quant { bool HasQuantizedTensors(Operation* op) { if (!IsOpQuantizable(op)) return false; @@ -72,5 +76,4 @@ SmallVector CloneOpWithReplacedOperands( return builder.clone(*op, mapping)->getResults(); } -} // namespace quant -} // namespace mlir +} // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h similarity index 90% rename from tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h rename to tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h index 320b6b93aa536d..791e608dc064dc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,23 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_UTILS_H_ -#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_UTILS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ +#include #include -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -namespace mlir { -namespace quant { +namespace mlir::quant { constexpr char kQuantizeFuncName[] = "quantize_i8"; constexpr char kDequantizeFuncName[] = "dequantize_i8"; @@ -132,6 +132,6 @@ bool AreSplatValuesEqual(Value x, Value y) { SmallVector CloneOpWithReplacedOperands( OpBuilder &builder, Operation *op, const SmallVector &new_operands); -} // namespace quant -} // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_UTILS_H_ +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td similarity index 92% rename from tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td rename to tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td index 654e4af58d3fc6..a5d1d8544ae931 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td @@ -147,3 +147,15 @@ def GetDefiningOp : NativeCodeCall<"$0.getDefiningOp()">; def CloneOpWithReplacedOperands : NativeCodeCall< "CloneOpWithReplacedOperands(" "$_builder, $0, llvm::SmallVector{$1...}).front()">; + +// Checks whether the value of a constant equals the given float, regardless +// of the tensor dimension. +class FloatValueEquals : Constraint>; + +// Fetches the default or null attribute, used for pattern matching. +def DefaultOrNullAttr : NativeCodeCall<"DefaultOrNullAttr($_builder, $0)">; + +// Returns true if the given op is a StableHLO constant op. +def IsStableHLOConstantOp : Constraint($0.getDefiningOp())">>; + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc similarity index 73% rename from tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc rename to tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc index 37d9b56ac1a7de..d74c8a952c8c24 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include #include @@ -22,50 +22,74 @@ limitations under the License. #include #include "absl/container/flat_hash_set.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeRange.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/ValueRange.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" +#include "tensorflow/core/ir/types/dialect.h" #include "tensorflow/core/platform/mutex.h" -namespace mlir { -namespace quant { +namespace mlir::quant { + +// Default version number for native serialization. +constexpr int64_t kDefaultVersion = 9; +// Default platform for XlaCallModuleOp. +constexpr StringRef kPlatformCpu = "CPU"; +// Name of `tf.XlaCallModule`'s dictionary attribute for keeping the +// deserialized stablehlo module's attributes. +constexpr llvm::StringRef kStablehloModuleAttrsAttrName = + "_stablehlo_module_attrs"; +// Attribute required for running shape refinement pass enabled in XlaCallModule +// version 8 and above. +constexpr llvm::StringRef kUsesShapePolymorphismAttr = + "jax.uses_shape_polymorphism"; // Checks if the op is inside a lifted function. -bool IsInLiftedFunc(Operation *op) { - return op->getParentOfType()->hasAttr(kFusedFunctionAttr); +bool IsInLiftedFunc(Operation& op) { + return op.getParentOfType()->hasAttr(kFusedFunctionAttr); } // Inserts the function to the symbol table of the module thread-safely. 
-StringAttr InsertToSymbolTable(Operation *module, Operation *function, - const std::string &func_name) { - static tensorflow::mutex *mtx = new tensorflow::mutex(); +StringAttr InsertToSymbolTable(Operation& module, Operation& function, + const std::string& func_name) { + static tensorflow::mutex* mtx = new tensorflow::mutex(); tensorflow::mutex_lock lock(*mtx); - SymbolTable symbol_table(module); + SymbolTable symbol_table(&module); std::string unique_name = func_name; int32_t uniquing_counter = 0; while (symbol_table.lookup(unique_name) != nullptr) { ++uniquing_counter; unique_name = func_name + "_" + std::to_string(uniquing_counter); } - function->setAttr("sym_name", - StringAttr::get(module->getContext(), unique_name)); - return symbol_table.insert(function); + function.setAttr("sym_name", + StringAttr::get(module.getContext(), unique_name)); + return symbol_table.insert(&function); } // Creates the TF::PartitionedCallOp with the given arguments and output types. @@ -100,15 +124,16 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location, tf_type::ShapeAttr::get(ctx, result_type.cast())); } auto empty_array_attr = ArrayAttr::get(ctx, {}); + auto platforms = ArrayAttr::get(ctx, {StringAttr::get(ctx, kPlatformCpu)}); TF::XlaCallModuleOp call_op = builder.create( location, /*output=*/output_types, /*args=*/args, - /*version=*/5, /*module=*/"", + /*version=*/kDefaultVersion, /*module=*/"", /*Sout=*/ArrayAttr::get(ctx, shape_attrs), /*dim_args_spec=*/empty_array_attr, - /*platforms=*/empty_array_attr, + /*platforms=*/platforms, /*function_list=*/empty_array_attr, /*has_token_input_output=*/false, /*disabled_checks=*/empty_array_attr); @@ -130,6 +155,12 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location, builder.getStringAttr(llvm::StringRef( std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable])))); + // Set jax.uses_shape_polymorphism=true to enable shape refinement at runtime. + // This is needed for native serialization version >= 8. + call_op->setAttr(kStablehloModuleAttrsAttrName, + builder.getDictionaryAttr(builder.getNamedAttr( + kUsesShapePolymorphismAttr, builder.getBoolAttr(true)))); + return call_op.getOutput(); } @@ -152,14 +183,14 @@ ValueRange createFunctionCallOp(OpBuilder builder, Location location, // Finds ops in the paths from arguments to results. The ops is listed in an // order that the former ops shouldn't have any dependencies on the later ones. -llvm::SmallVector FindOpsFromArgumentsToResults( - const llvm::SmallVector &arguments, - const llvm::SmallVector &results) { +llvm::SmallVector FindOpsFromArgumentsToResults( + const llvm::SmallVector& arguments, + const llvm::SmallVector& results) { std::queue value_queue; for (Value result : results) { value_queue.push(result); } - absl::flat_hash_set argument_set; + absl::flat_hash_set argument_set; for (Value argument : arguments) { argument_set.insert(argument.getImpl()); } @@ -167,15 +198,15 @@ llvm::SmallVector FindOpsFromArgumentsToResults( // Searching for ops from results to arguments. Duplicate ops in the op stack // are intentional in order to make sure the op on the top of the stack // doesn't depends on any ops below it. 
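As a side note on the ordering produced by `FindOpsFromArgumentsToResults`, here is a minimal, self-contained C++ model of the same traversal, using a toy `Node` struct in place of MLIR `Operation`/`Value` and omitting the argument set. It shows why the intentional duplicate pushes are needed so that the popped, de-duplicated order lists producers before their consumers; the names are illustrative only.

#include <cstdio>
#include <queue>
#include <stack>
#include <string>
#include <unordered_set>
#include <vector>

// Toy stand-in for an operation: a name plus the nodes producing its operands.
struct Node {
  std::string name;
  std::vector<Node*> operands;
};

// Walk backwards from the results, push every reached node onto a stack
// (duplicates included), then de-duplicate while popping. The kept occurrence
// of each node is its latest push, which always sits above its consumers.
std::vector<Node*> OpsFromArgumentsToResults(const std::vector<Node*>& results) {
  std::queue<Node*> queue;
  for (Node* result : results) queue.push(result);

  std::stack<Node*> stack;
  while (!queue.empty()) {
    Node* node = queue.front();
    queue.pop();
    stack.push(node);  // Duplicates are intentional.
    for (Node* operand : node->operands) queue.push(operand);
  }

  std::vector<Node*> sorted;
  std::unordered_set<Node*> seen;
  while (!stack.empty()) {
    Node* node = stack.top();
    stack.pop();
    if (seen.insert(node).second) sorted.push_back(node);
  }
  return sorted;
}

int main() {
  // a feeds both b and c; b feeds c; c is the sole result.
  Node a{"a", {}}, b{"b", {&a}}, c{"c", {&a, &b}};
  for (Node* node : OpsFromArgumentsToResults({&c})) {
    std::printf("%s ", node->name.c_str());
  }
  std::printf("\n");  // Prints "a b c"; skipping duplicate pushes would give "b a c",
                      // placing consumer b ahead of its producer a.
  return 0;
}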
- std::stack op_stack; + std::stack op_stack; while (!value_queue.empty()) { Value current_value = value_queue.front(); value_queue.pop(); - Operation *defining_node = current_value.getDefiningOp(); + Operation* defining_node = current_value.getDefiningOp(); if (defining_node == nullptr) continue; op_stack.push(defining_node); - for (const auto &arg : defining_node->getOperands()) { + for (const auto& arg : defining_node->getOperands()) { if (!argument_set.contains(arg.getImpl())) { value_queue.push(arg); } @@ -183,10 +214,10 @@ llvm::SmallVector FindOpsFromArgumentsToResults( } // Remove duplicate ops from the op stack. - llvm::SmallVector sorted_ops; - absl::flat_hash_set unique_ops; + llvm::SmallVector sorted_ops; + absl::flat_hash_set unique_ops; while (!op_stack.empty()) { - Operation *current_op = op_stack.top(); + Operation* current_op = op_stack.top(); op_stack.pop(); if (unique_ops.contains(current_op)) continue; sorted_ops.push_back(current_op); @@ -206,21 +237,20 @@ llvm::SmallVector FindOpsFromArgumentsToResults( // identifiers. // This function returns success if all attributes could be found. LogicalResult SetAttributeMap( - MLIRContext *context, const llvm::SmallVector &attributes, - const llvm::SmallVector &ops) { + MLIRContext& context, const llvm::SmallVector& attributes, + const llvm::SmallVector& ops) { // A map to find which operation an attribute belongs to. // The key for this map uses the entire NamedAttribute object, i.e. the // {attribute_name, attribute_value} pair. - llvm::SmallDenseMap attr_to_op_map; - for (Operation *op : ops) { - for (const auto &named_attr : op->getAttrs()) { + llvm::SmallDenseMap attr_to_op_map; + for (Operation* op : ops) { + for (const NamedAttribute named_attr : op->getAttrs()) { attr_to_op_map.insert({named_attr, op}); } } for (int idx : llvm::seq(0, attributes.size())) { - const NamedAttribute &attribute = attributes[idx]; - + const NamedAttribute& attribute = attributes[idx]; // Skip the following steps if the attribute value is `NullAttribute`. if (const auto string_attr = attribute.getValue().dyn_cast_or_null(); @@ -229,27 +259,38 @@ LogicalResult SetAttributeMap( continue; } - if (attr_to_op_map.count(attribute) == 0) { - mlir::emitError(UnknownLoc::get(context), + if (std::find_if( + attr_to_op_map.begin(), attr_to_op_map.end(), [&](auto attr_op) { + return std::get<0>(attr_op).getName() == attribute.getName(); + }) == attr_to_op_map.end()) { + mlir::emitError(UnknownLoc::get(&context), "Could not find attribute: " + attribute.getName().str()); return failure(); } - Operation *owner_op = attr_to_op_map[attribute]; - - std::string new_attr_map_str{}; - if (owner_op->hasAttr(kAttrMapAttribute)) { - new_attr_map_str = - owner_op->getAttrOfType(kAttrMapAttribute).str(); - absl::StrAppend(&new_attr_map_str, ","); + Operation* owner_op; + for (const auto& [attr, val] : attr_to_op_map) { + if (attr.getName() == attribute.getName()) owner_op = val; } + if (stablehlo::IsStablehloOp(owner_op)) { + owner_op->setAttr(StringRef(attribute.getName()), attribute.getValue()); + } else { + owner_op = attr_to_op_map[attribute]; + + std::string new_attr_map_str{}; + if (owner_op->hasAttr(kAttrMapAttribute)) { + new_attr_map_str = + owner_op->getAttrOfType(kAttrMapAttribute).str(); + absl::StrAppend(&new_attr_map_str, ","); + } - // Append ":". Ex) "0:transpose_a". 
- const std::string identifier = std::to_string(idx); - const mlir::StringAttr attribute_name = attribute.getName(); - absl::StrAppend(&new_attr_map_str, identifier, ":", attribute_name.str()); - owner_op->setAttr(kAttrMapAttribute, - StringAttr::get(context, new_attr_map_str)); + // Append ":". Ex) "0:transpose_a". + const std::string identifier = std::to_string(idx); + const mlir::StringAttr attribute_name = attribute.getName(); + absl::StrAppend(&new_attr_map_str, identifier, ":", attribute_name.str()); + owner_op->setAttr(kAttrMapAttribute, + StringAttr::get(&context, new_attr_map_str)); + } } return success(); } @@ -257,15 +298,15 @@ LogicalResult SetAttributeMap( // Creates a function to wrap the section between arguments and results. llvm::SmallVector LiftAsFunctionCall( OpBuilder builder, Location location, FunctionCallOpType call_op_type, - StringRef func_name, const llvm::SmallVector &arguments, - const llvm::SmallVector &results, - const llvm::SmallVector &attributes) { - MLIRContext *context = builder.getContext(); + StringRef func_name, const llvm::SmallVector& arguments, + const llvm::SmallVector& results, + const llvm::SmallVector& attributes) { + MLIRContext* context = builder.getContext(); if (results.empty()) { mlir::emitError(UnknownLoc::get(context), "No result values specified"); return {}; } - Operation *result_op = results[0].getDefiningOp(); + Operation* result_op = results[0].getDefiningOp(); auto module = result_op->getParentOfType(); // Create a private function and copy all ops between arguments and results. @@ -277,7 +318,7 @@ llvm::SmallVector LiftAsFunctionCall( auto func_type = FunctionType::get(context, arg_types, result_types); llvm::SmallVector arg_locs; - for (const auto &arg : arguments) { + for (const auto& arg : arguments) { arg_locs.push_back(arg.getLoc()); } auto wrap_func = builder.create(location, func_name, func_type); @@ -298,7 +339,7 @@ llvm::SmallVector LiftAsFunctionCall( auto cloning_ops = FindOpsFromArgumentsToResults(arguments, results); // Set the location of call op to QuantizationUnitLoc if found. Location call_op_loc = location; - for (Operation *op : cloning_ops) { + for (Operation* op : cloning_ops) { std::optional unit = FindQuantizationUnitFromLoc(op->getLoc()); if (unit.has_value()) { @@ -306,10 +347,10 @@ llvm::SmallVector LiftAsFunctionCall( } } - if (failed(SetAttributeMap(context, attributes, cloning_ops))) { + if (failed(SetAttributeMap(*context, attributes, cloning_ops))) { current_func.emitError() << "Some attributes couldn't be found."; } - for (Operation *op : cloning_ops) { + for (Operation* op : cloning_ops) { builder.clone(*op, mapping); } @@ -321,7 +362,7 @@ llvm::SmallVector LiftAsFunctionCall( // Create a function call to the newly created function. 
StringAttr new_func_name = - InsertToSymbolTable(module, wrap_func, func_name.str()); + InsertToSymbolTable(*module, *wrap_func, func_name.str()); builder.setInsertionPointAfter(result_op); ValueRange new_results = createFunctionCallOp(builder, call_op_loc, call_op_type, @@ -331,15 +372,15 @@ llvm::SmallVector LiftAsFunctionCall( llvm::SmallVector LiftAsFunctionCall( OpBuilder builder, Location location, FunctionCallOpType call_op_type, - StringRef func_name, const llvm::SmallVector &arguments, - const llvm::SmallVector &results) { + StringRef func_name, const llvm::SmallVector& arguments, + const llvm::SmallVector& results) { llvm::SmallVector attributes; return LiftAsFunctionCall(builder, location, call_op_type, func_name, arguments, results, attributes); } llvm::SmallVector AppendToVector( - const llvm::SmallVector &arguments, Value append) { + const llvm::SmallVector& arguments, Value append) { llvm::SmallVector ret(arguments); ret.push_back(append); return ret; @@ -422,5 +463,4 @@ bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr) { rhs_out_idx_start >= batch_dim_size; } -} // namespace quant -} // namespace mlir +} // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h similarity index 76% rename from tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h rename to tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h index 83f1ed2ce6d59d..c796fbbca32a2f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_LIFT_AS_FUNCTION_CALL_UTILS_H_ -#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_LIFT_AS_FUNCTION_CALL_UTILS_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ #include "absl/strings/string_view.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project -// This header file defines common utils used by TF-Quant transformation -// passes to lift op compositions to a function. -namespace mlir { -namespace quant { +namespace mlir::quant { // This attribute will be set for functions created by this pass. 
+// Presence of this attribute will mark the function as quantization target. inline constexpr absl::string_view kFusedFunctionAttr = "tf_quant.composite_function"; // The keyword to detect if this is a `NullAttribute`. @@ -43,7 +43,7 @@ inline constexpr absl::string_view kOriginalStablehloEntryFunctionAttrName = enum FunctionCallOpType { TFPartitionedCallOp = 0, TFXlaCallModuleOp = 1 }; // Checks if the op is inside a lifted function. -bool IsInLiftedFunc(Operation *op); +bool IsInLiftedFunc(Operation &op); // Checks if the given einsum op is supported for XlaDotV2 quantization. bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr); @@ -70,6 +70,6 @@ llvm::SmallVector LiftAsFunctionCall( llvm::SmallVector AppendToVector( const llvm::SmallVector &arguments, Value append); -} // namespace quant -} // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_LIFT_AS_FUNCTION_CALL_UTILS_H_ +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.td b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td similarity index 96% rename from tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.td rename to tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td index 6110a38c721f98..a4437b50ac0cf0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.td +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ class NamedAttr : // Checks if the value is not defined inside a lifted function by checking the // `tf_quant.composite_function` attribute. def IsNotInLiftedFunc : - Constraint>; + Constraint>; // Checks if the given einsum op is supported for XlaDotV2 quantization. def IsEinsumSupportedByXlaDotV2 : diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc new file mode 100644 index 00000000000000..4947fcd910e64b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc @@ -0,0 +1,131 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" + +#include +#include "absl/strings/string_view.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::quant::common { +namespace { + +class LiftAsFunctionCallTest : public QuantizationTestBase {}; + +constexpr absl::string_view kModuleLifted = R"mlir( + module { + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + } +)mlir"; + +TEST_F(LiftAsFunctionCallTest, LiftedFunctionSucceeds) { + OwningOpRef module_op_ref = ParseModuleOpString(kModuleLifted); + func::FuncOp composite_dot_general_fn = + GetFunctionFromModule(*module_op_ref, "composite_dot_general_fn_1"); + Operation* dot_general_op = + FindOperationOfType( + composite_dot_general_fn); + EXPECT_TRUE(IsInLiftedFunc(*dot_general_op)); +} + +constexpr absl::string_view kModuleStableHlo = R"mlir( + module { + func.func private @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + } +)mlir"; + +TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) { + OwningOpRef module_op_ref = ParseModuleOpString(kModuleStableHlo); + func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + Operation* dot_general_op = + FindOperationOfType(main_fn); + + const SmallVector& attributes = { + builder_.getNamedAttr("precision_config", + builder_.getArrayAttr(SmallVector( + 1, stablehlo::PrecisionAttr::get( + &ctx_, stablehlo::Precision::DEFAULT)))), + }; + Operation* lifted_op = + LiftAsFunctionCall(builder_, dot_general_op->getLoc(), + FunctionCallOpType::TFXlaCallModuleOp, + "composite_dot_general_fn", + dot_general_op->getOperands(), + dot_general_op->getResults(), attributes)[0] + .getDefiningOp(); + const auto entry_function_symbol_ref = + lifted_op->getAttrOfType("_entry_function"); + SymbolTable symbol_table(*module_op_ref); + auto entry_func = dyn_cast_or_null( + symbol_table.lookup(entry_function_symbol_ref.getValue())); + Operation* lifted_dot_general_op = + FindOperationOfType(entry_func); + + EXPECT_TRUE(isa(lifted_op)); + EXPECT_EQ(lifted_op->getAttr("_original_entry_function").cast(), + "composite_dot_general_fn_1"); + EXPECT_EQ( + lifted_dot_general_op->getAttr("precision_config").cast(), + 
builder_.getArrayAttr(SmallVector( + 1, stablehlo::PrecisionAttr::get(&ctx_, + stablehlo::Precision::DEFAULT)))); +} + +TEST_F(LiftAsFunctionCallTest, FunctionNoAttrLiftedAsXlaCallModuleOp) { + OwningOpRef module_op_ref = ParseModuleOpString(kModuleStableHlo); + func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + Operation* dot_general_op = + FindOperationOfType(main_fn); + Operation* lifted_op = + LiftAsFunctionCall( + builder_, dot_general_op->getLoc(), + FunctionCallOpType::TFXlaCallModuleOp, "composite_dot_general_fn", + dot_general_op->getOperands(), dot_general_op->getResults())[0] + .getDefiningOp(); + EXPECT_TRUE(isa(lifted_op)); + EXPECT_EQ(lifted_op->getAttr("_original_entry_function").cast(), + "composite_dot_general_fn_1"); +} + +TEST_F(LiftAsFunctionCallTest, EinsumSupportedForXlaDotV2Succeeds) { + StringAttr einsum_supported_by_xla_dot_v2_attr = + builder_.getStringAttr("ijk,ikm->ijm"); + StringAttr einsum_one_operand = builder_.getStringAttr("ijk->ikj"); + StringAttr einsum_ellipsis = builder_.getStringAttr("...gse->...gs"); + EXPECT_TRUE(IsEinsumSupportedByXlaDotV2(einsum_supported_by_xla_dot_v2_attr)); + EXPECT_FALSE(IsEinsumSupportedByXlaDotV2(einsum_one_operand)); + EXPECT_FALSE(IsEinsumSupportedByXlaDotV2(einsum_ellipsis)); +} + +} // namespace +} // namespace mlir::quant::common diff --git a/tensorflow/compiler/mlir/quantization/common/test_base.h b/tensorflow/compiler/mlir/quantization/common/test_base.h new file mode 100644 index 00000000000000..ad847a29477779 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/test_base.h @@ -0,0 +1,79 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ + +#include +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/core/platform/test.h" + +namespace mlir::quant::common { + +using ::testing::Test; + +class QuantizationTestBase : public Test { + protected: + QuantizationTestBase() { + ctx_.loadDialect(); + } + + // Parses `module_op_str` to create a `ModuleOp`. Checks whether the created + // module op is valid. 
+ OwningOpRef ParseModuleOpString( + const absl::string_view module_op_str) { + auto module_op_ref = parseSourceString(module_op_str, &ctx_); + EXPECT_TRUE(module_op_ref); + return module_op_ref; + } + + // Gets the function with the given name from the module. + func::FuncOp GetFunctionFromModule(ModuleOp module, + absl::string_view function_name) { + SymbolTable symbol_table(module); + return symbol_table.lookup(function_name); + } + + // Returns the first operation with the given type in the function. + template + OpType FindOperationOfType(func::FuncOp function) { + for (auto op : function.getBody().getOps()) { + return op; + } + return nullptr; + } + + mlir::MLIRContext ctx_{}; + OpBuilder builder_{&ctx_}; +}; + +} // namespace mlir::quant::common + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 640cd2e6cb7366..6ab58f78aac025 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -25,13 +25,13 @@ package( licenses = ["notice"], ) -# TODO(b/264218457): Add quantize and post_quantize passes. cc_library( name = "passes", srcs = [ "passes/lift_quantizable_spots_as_functions.cc", "passes/lift_quantizable_spots_as_functions_fusion.inc", "passes/lift_quantizable_spots_as_functions_simple.inc", + "passes/populate_shape.cc", "passes/post_quantize.cc", "passes/prepare_quantize.cc", "passes/quantize.cc", @@ -40,6 +40,7 @@ cc_library( "passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc", "passes/restore_function_name.cc", "passes/unfuse_mhlo_batch_norm.cc", + "passes/unwrap_xla_call_module_op.cc", ], hdrs = [ "passes/passes.h", @@ -51,6 +52,7 @@ cc_library( ":lift_quantizable_spots_as_functions_fusion_inc_gen", ":lift_quantizable_spots_as_functions_simple_inc_gen", ":quantization_options_proto_cc", + ":quantization_patterns", ":stablehlo_passes_inc_gen", ":stablehlo_type_utils", ":uniform_quantized_types", @@ -58,15 +60,20 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", + "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_op_quant_spec", - "//tensorflow/compiler/mlir/quantization/tensorflow/utils:lift_as_function_call_utils", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/ir/types:Dialect", "//tensorflow/core/platform:path", "//tensorflow/core/tpu:tpu_defs", @@ -88,6 +95,7 @@ cc_library( "@llvm-project//mlir:Pass", 
"@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Rewrite", + "@llvm-project//mlir:ShapeDialect", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", @@ -99,9 +107,42 @@ cc_library( "@stablehlo//:chlo_ops", "@stablehlo//:stablehlo_ops", ], - # Alwayslink is required for registering the MLIR passes. - # TODO(b/255530126): Split the pass registration from the definitions to avoid binary size bloat. - alwayslink = True, +) + +cc_library( + name = "quantization_patterns", + srcs = ["passes/quantization_patterns.cc"], + hdrs = [ + "passes/quantization_patterns.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + ":bridge_passes", + ":uniform_quantized_types", + "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/quantization/tensorflow:passes", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:path", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:stablehlo_ops", + ], ) td_library( @@ -109,13 +150,12 @@ td_library( srcs = [ "passes/lift_quantizable_spots_as_functions_fusion.td", "passes/lift_quantizable_spots_as_functions_simple.td", - "passes/utils.td", ], compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops_td_files", + "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call_td_files", "//tensorflow/compiler/mlir/quantization/tensorflow:quant_td_files", - "//tensorflow/compiler/mlir/quantization/tensorflow/utils:lift_as_function_call_utils_td_files", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", "@llvm-project//mlir:ArithOpsTdFiles", "@llvm-project//mlir:FuncTdFiles", @@ -134,7 +174,10 @@ gentbl_cc_library( ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/lift_quantizable_spots_as_functions_simple.td", - deps = [":quant_td_files"], + deps = [ + ":quant_td_files", + "//tensorflow/compiler/mlir/quantization/common:quant_td_files", + ], ) gentbl_cc_library( @@ -148,7 +191,10 @@ gentbl_cc_library( ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes/lift_quantizable_spots_as_functions_fusion.td", - deps = [":quant_td_files"], + deps = [ + ":quant_td_files", + "//tensorflow/compiler/mlir/quantization/common:quant_td_files", + ], ) gentbl_cc_library( @@ -222,8 +268,6 @@ cc_library( "@local_xla//xla/translate/hlo_to_mhlo:attribute_importer", "@stablehlo//:chlo_ops", ], - # Force link to ensure ConvertTFQuantOpsToMHLOPass is registered. 
- alwayslink = True, ) tf_cc_test( @@ -331,11 +375,8 @@ cc_library( ":fill_quantization_options", ":passes", ":quantization_options_proto_cc", - "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", "//tensorflow/core/platform:path", - "@com_google_absl//absl/container:flat_hash_set", - "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:Pass", ], @@ -487,6 +528,27 @@ tf_proto_library( # ) # copybara:uncomment_end +# OSS only: This target is header-only. Link `quantization_config_proto_cc_impl` only to +# `libtensorflow_framework.so` via `lib_internal_impl`. Do NOT link +# `quantization_config_proto_cc_impl` directly unless the target does not link +# `libtensorflow_framework.so`. +tf_proto_library( + name = "quantization_config_proto", + srcs = ["quantization_config.proto"], + cc_api_version = 2, + make_default_target_header_only = True, + visibility = ["//visibility:public"], +) + +# copybara:uncomment_begin(google-only) +# py_proto_library( +# name = "quantization_config_py_pb2", +# api_version = 2, +# visibility = [":internal_visibility_allowlist_package"], +# deps = [":quantization_config_proto"], +# ) +# copybara:uncomment_end + exports_files([ "run_lit.sh", ]) @@ -503,14 +565,20 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", + "//tensorflow/core/ir/types:Dialect", "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:ShapeDialect", "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", "@local_xla//xla/mlir_hlo:hlo_dialect_registration", + "@local_xla//xla/mlir_hlo:mhlo_passes", "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_passes", ], ) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD new file mode 100644 index 00000000000000..5c94eb06e617df --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD @@ -0,0 +1,161 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load( + "//tensorflow:tensorflow.default.bzl", + "get_compatible_with_portable", +) + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", + "//tensorflow/compiler/mlir/quantization/tensorflow:__subpackages__", + ], + licenses = ["notice"], +) + +cc_library( + name = "component", + hdrs = ["component.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "@com_google_absl//absl/status:statusor", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "io", + srcs = ["io.cc"], + hdrs = ["io.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:str_format", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:statusor", + ], +) + +tf_cc_test( + name = "io_test", + srcs = ["io_test.cc"], + deps = [ + ":io", + "@com_google_absl//absl/functional:any_invocable", + 
"@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:types", + ], +) + +cc_library( + name = "graph_def", + srcs = [], + hdrs = ["graph_def.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/core:protos_all_cc", + ], +) + +tf_cc_test( + name = "graph_def_test", + srcs = ["graph_def_test.cc"], + deps = [ + ":graph_def", + "//tensorflow/core:protos_all_cc", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:protobuf", + ], +) + +cc_library( + name = "debugger", + srcs = ["debugger.cc"], + hdrs = ["debugger.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:graph_def", + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_library( + name = "export", + srcs = ["export.cc"], + hdrs = ["export.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + ], +) + +tf_cc_test( + name = "export_test", + srcs = ["export_test.cc"], + deps = [ + ":export", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:protobuf", + ], +) + +cc_library( + name = "precalibration", + srcs = ["precalibration.cc"], + hdrs = ["precalibration.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":component", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "@com_google_absl//absl/log:die_if_null", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:errors", + ], +) + +tf_cc_test( + name = "precalibration_test", + srcs = ["precalibration_test.cc"], + deps = [ + ":precalibration", + "//tensorflow/compiler/mlir/quantization/common:test_base", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status:statusor", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", 
+ "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:status_matchers", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD new file mode 100644 index 00000000000000..946a733c2e7da9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD @@ -0,0 +1,61 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", + "//tensorflow/compiler/mlir/quantization/tensorflow:__subpackages__", + ], + licenses = ["notice"], +) + +cc_library( + name = "min_max_value", + srcs = [], + hdrs = ["min_max_value.h"], + compatible_with = get_compatible_with_portable(), + deps = [], +) + +cc_library( + name = "statistics", + srcs = ["statistics.cc"], + hdrs = ["statistics.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:graph_def", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:str_format", + ], +) + +cc_library( + name = "assign_ids", + srcs = ["assign_ids.cc"], + hdrs = ["assign_ids.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:graph_def", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "assign_ids_test", + srcs = ["assign_ids_test.cc"], + deps = [ + ":assign_ids", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", + "//tensorflow/core:protos_all_cc", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:protobuf", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.cc new file mode 100644 index 00000000000000..31e990bbcf20a5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.cc @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h" + +#include <cstdint> + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" + +namespace stablehlo::quantization { +namespace { + +using ::tensorflow::GraphDef; +using ::tensorflow::NodeDef; +using ::tensorflow::calibrator::CalibratorSingleton; + +} // namespace + +void AssignIdsToCustomAggregatorOps(GraphDef& graph_def) { + MutateNodeDefs(graph_def, [](NodeDef& node_def) { + if (node_def.op() == "CustomAggregator") { + const int64_t new_id = CalibratorSingleton::IssueNewId(); + (*node_def.mutable_attr())["id"].set_s(absl::StrCat(new_id)); + } + }); +} + +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h new file mode 100644 index 00000000000000..6feaa81cc16ce4 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h @@ -0,0 +1,30 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_ASSIGN_IDS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_ASSIGN_IDS_H_ + +#include "tensorflow/core/framework/graph.pb.h" + +namespace stablehlo::quantization { + +// Assigns unique ids to each CustomAggregator op found in `graph_def`. The +// ids are set to the `id` attribute. The ids are used during the calibration +// step to identify the collected quantization statistics for each +// CustomAggregator op. +void AssignIdsToCustomAggregatorOps(tensorflow::GraphDef& graph_def); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_ASSIGN_IDS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids_test.cc new file mode 100644 index 00000000000000..488315a32271b5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids_test.cc @@ -0,0 +1,63 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h" + +#include +#include +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep for tsl::protobuf + +namespace stablehlo::quantization { +namespace { + +using ::tensorflow::GraphDef; +using ::testing::IsEmpty; +using ::testing::Not; +using ::testing::SizeIs; +using ::tsl::protobuf::TextFormat; + +TEST(AssignIdsTest, IdsAddedToCustomAggregatorOps) { + GraphDef graph_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb( + node { op: "CustomAggregator" name: "foo" } + )pb", + &graph_def)); + + AssignIdsToCustomAggregatorOps(graph_def); + + ASSERT_THAT(graph_def.node(), SizeIs(1)); + EXPECT_TRUE(graph_def.node()[0].attr().contains("id")); + EXPECT_THAT(graph_def.node()[0].attr().at("id").s(), Not(IsEmpty())); +} + +TEST(AssignIdsTest, IdsNotAddedForNonCustomAggregatorOps) { + GraphDef graph_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb( + node { op: "NotCustomAggregator" name: "bar" } + )pb", + &graph_def)); + + AssignIdsToCustomAggregatorOps(graph_def); + + ASSERT_THAT(graph_def.node(), SizeIs(1)); + EXPECT_FALSE(graph_def.node()[0].attr().contains("id")); +} + +} // namespace +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h new file mode 100644 index 00000000000000..5302bad49dd5e8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h @@ -0,0 +1,28 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_MIN_MAX_VALUE_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_MIN_MAX_VALUE_H_ + +#include + +namespace stablehlo::quantization { + +// Represents the (min, max) value pair, representing the range of values after +// calibrating for quantization. 
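The alias declared just below is a plain pair of floats; calibration/statistics.cc unpacks values of this shape with a structured binding. A minimal caller-side sketch (illustrative only, not part of the patch; `RangeWidth` is a hypothetical helper):

  #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h"

  // Illustrative only: computes the width of a calibrated [min, max] range.
  float RangeWidth(const stablehlo::quantization::MinMaxValue& range) {
    const auto [min_value, max_value] = range;
    return max_value - min_value;
  }
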
+using MinMaxValue = std::pair; + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_MIN_MAX_VALUE_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc new file mode 100644 index 00000000000000..6fe1f8d9cd4f8f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc @@ -0,0 +1,71 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" + +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/graph.pb.h" + +namespace stablehlo::quantization { +namespace { + +using ::tensorflow::GraphDef; +using ::tensorflow::NodeDef; +using ::tensorflow::calibrator::CalibrationStatistics; +using ::tensorflow::calibrator::CalibratorSingleton; +using ::tensorflow::quantization::CalibrationOptions; +using ::tensorflow::quantization::PyFunctionLibrary; + +} // namespace + +absl::Status AddCalibrationStatistics( + GraphDef& graph_def, const CalibrationOptions& calibration_options, + const PyFunctionLibrary& py_function_library) { + absl::Status status = absl::OkStatus(); + MutateNodeDefs(graph_def, [&py_function_library, &calibration_options, + &status](NodeDef& node_def) { + if (node_def.op() != "CustomAggregator") return; + const std::string& id = node_def.attr().at("id").s(); + std::optional statistics = + CalibratorSingleton::GetStatistics(id); + if (statistics == std::nullopt) { + status = absl::InternalError( + absl::StrFormat("Calibrated data does not exist. Cannot find " + "statistics. 
value for id: %s", + id)); + return; + } + + const auto [min_value, max_value] = + py_function_library.GetCalibrationMinMaxValue(*statistics, + calibration_options); + CalibratorSingleton::ClearData(id); + + (*node_def.mutable_attr())["min"].set_f(min_value); + (*node_def.mutable_attr())["max"].set_f(max_value); + }); + return status; +} + +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h new file mode 100644 index 00000000000000..c1a551806f287c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ + +#include "absl/status/status.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/graph.pb.h" + +namespace stablehlo::quantization { + +// Adds calibrated min / max values to CustomAggregator nodes in `graph_def`. +// The min and max values will be added to the "min" and "max" attributes, +// respectively. `calibration_options` provides the strategy to retrieve min and +// max values. +absl::Status AddCalibrationStatistics( + tensorflow::GraphDef& graph_def, + const tensorflow::quantization::CalibrationOptions& calibration_options, + const tensorflow::quantization::PyFunctionLibrary& py_function_library); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h new file mode 100644 index 00000000000000..a1ddb5cb4688ff --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h @@ -0,0 +1,40 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_COMPONENT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_COMPONENT_H_ + +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace mlir::quant::stablehlo { + +// Component is a public abstraction for StableHLO Quantizer that represents the +// most basic unit of action applied to the StableHLO graph. Derived classes +// should override the `Run` method to implement the action. +class Component { + public: + virtual ~Component() = default; + + // Runs the action to the StableHLO graph, passed by the `module_op`. `config` + // should provide information necessary to configure the action's behavior. + virtual absl::StatusOr Run( + ModuleOp module_op, + const ::stablehlo::quantization::QuantizationConfig& config) = 0; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_COMPONENT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc new file mode 100644 index 00000000000000..4588d5f00a7523 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc @@ -0,0 +1,73 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h" + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace stablehlo::quantization { +namespace { + +using ::tensorflow::NodeDef; +using ::tensorflow::SignatureDef; +using ::tensorflow::quantization::DebuggerOptions; +using ::tensorflow::quantization::ExportedModel; +using ::tensorflow::quantization::PyFunctionLibrary; + +} // namespace + +void EnableDebugging( + ExportedModel& exported_model, const DebuggerOptions& debugger_options, + const PyFunctionLibrary& py_function_library, + const absl::string_view src_saved_model_path, + const std::unordered_set& tags, + const absl::flat_hash_map& signature_def_map) { + // Enable `DumpTensor` nodes in `graph_def`. DumpTensor is disabled by + // default to avoid logging data during calibration. 
+ MutateNodeDefs(*exported_model.mutable_graph_def(), [](NodeDef& node_def) { + if (node_def.op() == "DumpTensor") { + (*node_def.mutable_attr())["enabled"].set_b(true); + } + }); + + if (debugger_options.debugger_type() == + DebuggerOptions::DEBUGGER_TYPE_WHOLE_MODEL) { + // TODO: b/295139417 - Remove CustomAggregator op in unquantized dump model. + // TODO: b/296916287 - Create a separate function for saving unquantized + // dump model. + py_function_library.SaveExportedModel( + debugger_options.unquantized_dump_model_path(), exported_model, + src_saved_model_path, tags, signature_def_map); + + // Update the `DumpTensor` ops' file name in `graph_def`. + MutateNodeDefs(*exported_model.mutable_graph_def(), [](NodeDef& node_def) { + if (node_def.op() == "DumpTensor") { + (*node_def.mutable_attr())["file_name"].set_s( + "quantized_tensor_data.pb"); + } + }); + } +} + +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h new file mode 100644 index 00000000000000..6bb427ecbdf1fd --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h @@ -0,0 +1,50 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace stablehlo::quantization { + +// Enables debugging on `exported_model` by updating the `DumpTensor` ops. +// +// Saves the current model to `debugger_options.unquantized_dump_model_path()` +// if the debugger type is `DEBUGGER_TYPE_WHOLE_MODEL`. This is required because +// in whole-model debugging mode the `DumpTensor` ops for the unquantized +// tensors are only inserted in the unquantized model whereas `DumpTensor` ops +// for the quantized tensors are only inserted in the quantized model. Both +// models are required to be able to dump both quantized and unquantized tensors +// and compare them offline. 
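The declaration follows below. A caller-side sketch of the whole-model debugging flow (illustrative only, not part of the patch); every argument is assumed to be supplied by the surrounding quantization driver, and the saved-model path is a placeholder:

  #include <string>
  #include <unordered_set>

  #include "absl/container/flat_hash_map.h"
  #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h"

  // Illustrative only: turns on the DumpTensor nodes of an already-exported
  // model, dumping the unquantized model when whole-model debugging is
  // requested in `debugger_options`.
  void EnableWholeModelDump(
      tensorflow::quantization::ExportedModel& exported_model,
      const tensorflow::quantization::DebuggerOptions& debugger_options,
      const tensorflow::quantization::PyFunctionLibrary& py_function_library,
      const std::unordered_set<std::string>& tags,
      const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
          signature_def_map) {
    stablehlo::quantization::EnableDebugging(
        exported_model, debugger_options, py_function_library,
        /*src_saved_model_path=*/"/tmp/original_saved_model", tags,
        signature_def_map);
  }
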
+void EnableDebugging( + tensorflow::quantization::ExportedModel& exported_model, + const tensorflow::quantization::DebuggerOptions& debugger_options, + const tensorflow::quantization::PyFunctionLibrary& py_function_library, + absl::string_view src_saved_model_path, + const std::unordered_set& tags, + const absl::flat_hash_map& + signature_def_map); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/export.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/export.cc new file mode 100644 index 00000000000000..bf90f153bf0f91 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/export.cc @@ -0,0 +1,89 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h" + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" + +namespace stablehlo::quantization { + +using ::tensorflow::AssetFileDef; +using ::tensorflow::GraphDef; +using ::tensorflow::SaverDef; +using ::tensorflow::quantization::ExportedModel; + +ExportedModel CreateExportedModel( + GraphDef&& graph_def, const absl::string_view init_node_name, + const absl::string_view checkpoint_dir, + const std::optional saver_def, + const absl::flat_hash_map& function_aliases, + const std::vector& asset_file_defs) { + ExportedModel exported_model{}; + *exported_model.mutable_graph_def() = graph_def; + exported_model.set_init_node_name(std::string(init_node_name)); + exported_model.set_checkpoint_dir(std::string(checkpoint_dir)); + + exported_model.mutable_function_aliases()->insert(function_aliases.begin(), + function_aliases.end()); + + for (const AssetFileDef& asset_file_def : asset_file_defs) { + *exported_model.mutable_asset_file_defs()->Add() = asset_file_def; + } + + if (saver_def != std::nullopt) { + *exported_model.mutable_saver_def() = *std::move(saver_def); + } + + return exported_model; +} + +// TODO: b/315746734 - Test this function using a test-only pass. 
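The definition of `AddExportPasses` follows. A minimal sketch of how a caller might drive the export pipeline (illustrative only, not part of the patch; `RunExportPipeline` is a hypothetical helper):

  #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
  #include "mlir/Pass/PassManager.h"  // from @llvm-project
  #include "mlir/Support/LogicalResult.h"  // from @llvm-project
  #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h"

  // Illustrative only: adds the export passes to a fresh PassManager and runs
  // them on a module that is about to be converted back to a GraphDef.
  mlir::LogicalResult RunExportPipeline(mlir::ModuleOp module_op) {
    mlir::PassManager pm(module_op.getContext());
    stablehlo::quantization::AddExportPasses(
        pm, /*duplicate_shape_determining_constants=*/true);
    return pm.run(module_op);
  }
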
+void AddExportPasses(mlir::PassManager& pm, + const bool duplicate_shape_determining_constants) { + if (duplicate_shape_determining_constants) { + pm.addNestedPass( + mlir::quant::CreateDuplicateShapeDeterminingConstantsPass()); + } + + pm.addPass(mlir::quant::CreateInsertMainFunctionPass()); + pm.addPass(mlir::quant::CreateLiftHashTableOpsAsArgsPass()); + pm.addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm.addPass(mlir::CreateBreakUpIslandsPass()); + pm.addPass(mlir::quant::CreateMergeInitializerFunctionOpsToMainPass()); + pm.addPass(mlir::quant::CreateMergeSaveFunctionOpsToMainPass()); + pm.addNestedPass( + mlir::quant::CreateMergeDuplicateResourceOpsPass()); + + // Used to clean up the "tf._noinliner" attribute that is previously used to + // prevent certain functions from being inlined (see + // `MarkFunctionsNoinlinePass`). InlinerPass must not come after this pass. + pm.addPass(mlir::TF::CreateStripNoinlineAttributePass()); +} + +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h new file mode 100644 index 00000000000000..9c5117cf97e4c5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h @@ -0,0 +1,79 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_EXPORT_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_EXPORT_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" + +namespace stablehlo::quantization { + +// Suffix string for the module export step. Used for debugging. +constexpr absl::string_view kExportStepSuffix = "_export"; + +// Options when running passes for exporting an MLIR ModuleOp. +struct ExportOptions { + // If set to `true`, it runs `DuplicateShapeDeterminingConstantsPass` before + // lowering to tf_executor dialect. + bool duplicate_shape_determining_constants = true; + + // If set to `true`, unfreezes constants into variables and saves them to a + // checkpoint file. Setting this to `true` is an experimental feature that has + // no stability guarantees. + bool unfreeze_constants = false; + + // Path to the directory where checkpoint files are saved. + std::string checkpoint_dir = ""; + + // Name used to identify the ModuleOp this is exporting. Only used for + // debugging and does not modify the behavior of the export. + std::string debug_name = "stablehlo_quant"; +}; + +// Factory function for `ExportedModel`. 
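The declaration is immediately below. A small usage sketch mirroring the accompanying unit test (illustrative only, not part of the patch; node and path names are placeholders):

  #include <optional>
  #include <utility>

  #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h"
  #include "tensorflow/core/framework/graph.pb.h"

  // Illustrative only: wraps a GraphDef into an ExportedModel with no SaverDef,
  // no function aliases and no asset files.
  tensorflow::quantization::ExportedModel WrapGraphDef(
      tensorflow::GraphDef graph_def) {
    return stablehlo::quantization::CreateExportedModel(
        std::move(graph_def), /*init_node_name=*/"init_op",
        /*checkpoint_dir=*/"/tmp/checkpoint_dir", /*saver_def=*/std::nullopt,
        /*function_aliases=*/{}, /*asset_file_defs=*/{});
  }
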
+[[nodiscard]] tensorflow::quantization::ExportedModel CreateExportedModel( + tensorflow::GraphDef&& graph_def, absl::string_view init_node_name, + absl::string_view checkpoint_dir, + std::optional saver_def, + const absl::flat_hash_map& function_aliases, + const std::vector& asset_file_defs); + +// Adds passes for transforming the MLIR module op so that it can be exported +// back to GraphDef. Roughly, this consists of: +// 1) Inserting the @main function, which will become the main Graph. +// 2) Duplicating shape-determining constants. +// 3) Converting TF dialect -> tf_executor dialect. +// 4) Adding initializer function's ops into @main function for correct +// resource initialization when loading the exported model. +// +// Duplicating shape-determining constants is required to place constants that +// affect the shape of a tensor to be placed in the TPU graph instead of in the +// CPU graph, when the graph gets converted for TPU inference. This allows these +// constants to be known at XLA compilation time. +void AddExportPasses(mlir::PassManager& pm, + bool duplicate_shape_determining_constants); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_EXPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/export_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/export_test.cc new file mode 100644 index 00000000000000..b6749c6621de31 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/export_test.cc @@ -0,0 +1,102 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h" + +#include +#include + +#include +#include +#include "absl/strings/string_view.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/protobuf/saver.pb.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace stablehlo::quantization { +namespace { + +using ::tensorflow::AssetFileDef; +using ::tensorflow::GraphDef; +using ::tensorflow::SaverDef; +using ::tensorflow::quantization::ExportedModel; +using ::testing::IsEmpty; +using ::testing::SizeIs; +using ::testing::StrEq; +using ::tsl::protobuf::TextFormat; + +TEST(CreateExportedModelTest, CreateExportedModelBasicFieldsSet) { + GraphDef graph_def{}; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(node { name: "foo" })pb", &graph_def)); + + const ExportedModel exported_model = + CreateExportedModel(std::move(graph_def), "init_node_name", + "checkpoint_dir", /*saver_def=*/std::nullopt, + /*function_aliases=*/{}, /*asset_file_defs=*/{}); + ASSERT_THAT(exported_model.graph_def().node(), SizeIs(1)); + EXPECT_THAT(exported_model.graph_def().node()[0].name(), StrEq("foo")); + + EXPECT_THAT(exported_model.init_node_name(), StrEq("init_node_name")); + EXPECT_THAT(exported_model.checkpoint_dir(), StrEq("checkpoint_dir")); + EXPECT_FALSE(exported_model.has_saver_def()); + EXPECT_THAT(exported_model.function_aliases(), IsEmpty()); + EXPECT_THAT(exported_model.asset_file_defs(), IsEmpty()); +} + +TEST(CreateExportedModelTest, CreateExportedModelWithAddedFunctionAliases) { + const ExportedModel exported_model = CreateExportedModel( + GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", + /*saver_def=*/std::nullopt, + /*function_aliases=*/{{"func1", "alias1"}, {"func2", "alias2"}}, + /*asset_file_defs=*/{}); + ASSERT_THAT(exported_model.function_aliases(), SizeIs(2)); + EXPECT_TRUE(exported_model.function_aliases().contains("func1")); + EXPECT_THAT(exported_model.function_aliases().at("func1"), StrEq("alias1")); + EXPECT_TRUE(exported_model.function_aliases().contains("func2")); + EXPECT_THAT(exported_model.function_aliases().at("func2"), StrEq("alias2")); +} + +TEST(CreateExportedModelTest, CreateExportedModelWithAddedAssetFileDefs) { + AssetFileDef asset1; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(filename: "fname1")pb", &asset1)); + + AssetFileDef asset2; + ASSERT_TRUE( + TextFormat::ParseFromString(R"pb(filename: "fname2")pb", &asset2)); + + const ExportedModel exported_model = CreateExportedModel( + GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", + /*saver_def=*/std::nullopt, /*function_aliases=*/{}, + /*asset_file_defs=*/{asset1, asset2}); + ASSERT_THAT(exported_model.asset_file_defs(), SizeIs(2)); + EXPECT_THAT(exported_model.asset_file_defs()[0].filename(), StrEq("fname1")); + EXPECT_THAT(exported_model.asset_file_defs()[1].filename(), StrEq("fname2")); +} + +TEST(CreateExportedModelTest, CreateExportedModelWithAddedSaverDef) { + SaverDef saver_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb(filename_tensor_name: "my_file")pb", &saver_def)); + + const ExportedModel exported_model = CreateExportedModel( + GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", saver_def, + /*function_aliases=*/{}, /*asset_file_defs=*/{}); + EXPECT_THAT(exported_model.saver_def().filename_tensor_name(), "my_file"); +} + +} // namespace +} // namespace stablehlo::quantization diff --git 
a/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h new file mode 100644 index 00000000000000..5796b18e65d632 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_GRAPH_DEF_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_GRAPH_DEF_H_ + +#include + +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" + +namespace stablehlo::quantization { + +// Mutates all `NodeDef`s in `graph_def` by applying `func`. It modifies the +// top-level `NodeDef`s as well as all `NodeDef`s in the function library. +// `func` should accept a `NodeDef` reference. +template >> +void MutateNodeDefs(tensorflow::GraphDef& graph_def, FuncT&& func) { + for (tensorflow::NodeDef& node_def : *graph_def.mutable_node()) { + func(node_def); + } + + for (tensorflow::FunctionDef& function_def : + *graph_def.mutable_library()->mutable_function()) { + for (tensorflow::NodeDef& node_def : *function_def.mutable_node_def()) { + func(node_def); + } + } +} + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_GRAPH_DEF_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def_test.cc new file mode 100644 index 00000000000000..58796acc4231bf --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def_test.cc @@ -0,0 +1,62 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h" + +#include +#include +#include "tensorflow/core/framework/node_def.pb.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace stablehlo::quantization { +namespace { + +using ::tensorflow::GraphDef; +using ::tensorflow::NodeDef; +using ::testing::SizeIs; +using ::testing::StrEq; +using ::tsl::protobuf::TextFormat; + +TEST(GraphDefTest, MutateNodeDefsMutatesTopLevelNodeDefs) { + GraphDef graph_def; + ASSERT_TRUE(TextFormat::ParseFromString(R"pb( + node { name: "foo" } + )pb", + &graph_def)); + MutateNodeDefs(graph_def, + [](NodeDef& node_def) { node_def.set_name("bar"); }); + + ASSERT_THAT(graph_def.node(), SizeIs(1)); + EXPECT_THAT(graph_def.node()[0].name(), StrEq("bar")); +} + +TEST(GraphDefTest, MutateNodeDefsMutatesFunctionNodeDefs) { + GraphDef graph_def; + ASSERT_TRUE(TextFormat::ParseFromString( + R"pb( + library { function { node_def { name: "foo" } } } + )pb", + &graph_def)); + + MutateNodeDefs(graph_def, + [](NodeDef& node_def) { node_def.set_name("bar"); }); + + ASSERT_THAT(graph_def.library().function(), SizeIs(1)); + ASSERT_THAT(graph_def.library().function()[0].node_def(), SizeIs(1)); + EXPECT_THAT(graph_def.library().function()[0].node_def()[0].name(), + StrEq("bar")); +} + +} // namespace +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc new file mode 100644 index 00000000000000..16a1013ae25166 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc @@ -0,0 +1,56 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" + +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_format.h" +#include "tsl/platform/env.h" +#include "tsl/platform/statusor.h" + +namespace stablehlo::quantization::io { + +absl::StatusOr GetLocalTmpFileName(tsl::Env* const env) { + std::string tmp_fname{}; + if (!env->LocalTempFilename(&tmp_fname)) { + return absl::InternalError("Failed to create tmp file name."); + } + + return tmp_fname; +} + +absl::StatusOr GetLocalTmpFileName() { + return GetLocalTmpFileName(tsl::Env::Default()); +} + +absl::StatusOr CreateTmpDir(tsl::Env* const env) { + TF_ASSIGN_OR_RETURN(std::string tmp_dir, GetLocalTmpFileName(env)); + + if (!env->RecursivelyCreateDir(tmp_dir).ok()) { + return absl::InternalError( + absl::StrFormat("Failed to create tmp dir: '%s'", tmp_dir)); + } + + return tmp_dir; +} + +absl::StatusOr CreateTmpDir() { + // The overloaded function uses the default env. 
+ return CreateTmpDir(tsl::Env::Default()); +} + +} // namespace stablehlo::quantization::io diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h new file mode 100644 index 00000000000000..bf17ba641f9da5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ + +#include <string> + +#include "absl/status/statusor.h" +#include "tsl/platform/env.h" + +namespace stablehlo::quantization::io { + +// Generates a unique local tmp file name. This function only generates the name +// (path) and doesn't actually create the file. +absl::StatusOr<std::string> GetLocalTmpFileName(tsl::Env* env); + +// Generates a unique local tmp file name. This function only generates the name +// (path) and doesn't actually create the file. The default environment +// `tsl::Env::Default` is used to generate the name. +absl::StatusOr<std::string> GetLocalTmpFileName(); + +// Creates a temporary directory in an environment defined by the implementation +// of `tsl::Env` and returns its path. Returns an InternalError status on +// failure. +absl::StatusOr<std::string> CreateTmpDir(tsl::Env* env); + +// Creates a temporary directory and returns its path. Returns an InternalError +// status on failure. The file system used will be the default environment +// returned by `tsl::Env::Default`. +absl::StatusOr<std::string> CreateTmpDir(); + +} // namespace stablehlo::quantization::io + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc new file mode 100644 index 00000000000000..b5cee2fc492f85 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc @@ -0,0 +1,144 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" + +#include +#include +#include + +#include +#include +#include "absl/functional/any_invocable.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tsl/platform/env.h" +#include "tsl/platform/file_system.h" +#include "tsl/platform/status.h" +#include "tsl/platform/status_matchers.h" +#include "tsl/platform/types.h" + +namespace stablehlo::quantization::io { +namespace { + +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::Not; +using ::tsl::testing::IsOk; +using ::tsl::testing::StatusIs; + +// A test-only derived class of `tsl::Env` which is broken. Used to cause +// failure for the `CreateTmpDir` function. Each of the overridden member +// functions implements a dummy functionality just to be able to create an +// instance of this class. +class TestEnvBrokenFileSystem : public tsl::Env { + public: + TestEnvBrokenFileSystem() = default; + + bool MatchPath(const tsl::string& path, const tsl::string& pattern) override { + return false; + } + + void SleepForMicroseconds(int64_t micros) override {} + + tsl::string GetRunfilesDir() override { return tsl::string("dummy_path"); } + + int32_t GetCurrentThreadId() override { return 0; } + + tsl::Thread* StartThread(const tsl::ThreadOptions& thread_options, + const tsl::string& name, + absl::AnyInvocable fn) override { + return nullptr; + } + + bool GetCurrentThreadName(tsl::string* name) override { return false; } + + void SchedClosure(absl::AnyInvocable closure) override {} + + void SchedClosureAfter(int64_t micros, + absl::AnyInvocable closure) override {} + + absl::Status LoadDynamicLibrary(const char* library_filename, + void** handle) override { + return tsl::OkStatus(); + } + + absl::Status GetSymbolFromLibrary(void* handle, const char* symbol_name, + void** symbol) override { + return tsl::OkStatus(); + } + + tsl::string FormatLibraryFileName(const tsl::string& name, + const tsl::string& version) override { + return tsl::string("dummy_path"); + } + + // This is the part that would break the `CreateTmpDir` function because it + // fails to provide a valid file system. + absl::Status GetFileSystemForFile(const std::string& fname, + tsl::FileSystem** result) override { + return absl::InternalError("Broken file system"); + } + + private: + void GetLocalTempDirectories(std::vector* list) override { + list->push_back("/tmp"); + } +}; + +// Represents an environment with broken file system and no available local tmp +// directories. +class TestEnvBrokenFileSystemAndNoLocalTempDirs + : public TestEnvBrokenFileSystem { + private: + // This is the part that essentially breaks the `GetLocalTmpFileName` function + // because it doesn't provide any available temp dirs. 
+ void GetLocalTempDirectories(std::vector* list) override {} +}; + +TEST(IoTest, GetLocalTmpFileNameGivesValidFileName) { + absl::StatusOr tmp_file_name = GetLocalTmpFileName(); + + ASSERT_THAT(tmp_file_name, IsOk()); + EXPECT_THAT(*tmp_file_name, Not(IsEmpty())); +} + +TEST(IoTest, GetLocalTmpFileNameWhenNoTempDirsReturnsInternalError) { + TestEnvBrokenFileSystemAndNoLocalTempDirs broken_env; + absl::StatusOr tmp_file_name = GetLocalTmpFileName(&broken_env); + + EXPECT_THAT(tmp_file_name, + StatusIs(absl::StatusCode::kInternal, + HasSubstr("Failed to create tmp file name"))); +} + +TEST(IoTest, CreateTmpDirReturnsValidTmpPath) { + absl::StatusOr tmp_dir = CreateTmpDir(); + + ASSERT_THAT(tmp_dir, IsOk()); + + auto* const env = tsl::Env::Default(); + EXPECT_THAT(env->FileExists(*tmp_dir), IsOk()); +} + +TEST(IoTest, CreateTmpDirWhenInvalidPathReturnsInternalError) { + TestEnvBrokenFileSystem test_env{}; + absl::StatusOr tmp_dir = CreateTmpDir(&test_env); + + EXPECT_THAT(tmp_dir, StatusIs(absl::StatusCode::kInternal, + HasSubstr("Failed to create tmp dir"))); +} + +} // namespace +} // namespace stablehlo::quantization::io diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.cc new file mode 100644 index 00000000000000..7bc1233bf35e04 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.cc @@ -0,0 +1,52 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.h" + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h" +#include "tsl/platform/errors.h" + +namespace mlir::quant::stablehlo { +namespace { + +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::quantization::RunPasses; + +// Name of the post-training quantization pre-calibration step. Used for +// debugging purposes. 
+constexpr absl::string_view kQuantPtqPreCalibrationStepName = + "quant_ptq_pre_calibration"; + +} // namespace + +absl::StatusOr PreCalibrationComponent::Run( + ModuleOp module_op, const QuantizationConfig& config) { + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/kQuantPtqPreCalibrationStepName, + /*add_passes_func=*/ + [this](mlir::PassManager& pm) { + AddQuantizePtqPreCalibrationStablehloPasses(pm, calibration_options_); + }, + ctx_, module_op)); + return module_op; +} + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.h new file mode 100644 index 00000000000000..8a0d90935825df --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.h @@ -0,0 +1,57 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PRECALIBRATION_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PRECALIBRATION_H_ + +#include + +#include "absl/log/die_if_null.h" +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Performs pre-calibration graph transformation as part of post-training +// static-range quantization. + +// The resulting `ModuleOp` contains `TF::CustomAggregatorOp`s for collecting +// quantization statistics, along with `TF::XlaCallModuleOp`s that correspond to +// lifted quantizable functions. +class PreCalibrationComponent : public Component { + public: + PreCalibrationComponent( + MLIRContext* ctx, + tensorflow::quantization::CalibrationOptions calibration_options) + : ctx_(*ABSL_DIE_IF_NULL(ctx)), // Crash OK + calibration_options_(std::move(calibration_options)) {} + + absl::StatusOr Run( + ModuleOp, + const ::stablehlo::quantization::QuantizationConfig& config) override; + + private: + MLIRContext& ctx_; + // TODO: b/315747711 - Allow `QuantizationConfig` to express calibration + // options and remove this field. 
+ tensorflow::quantization::CalibrationOptions calibration_options_; +}; + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PRECALIBRATION_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration_test.cc new file mode 100644 index 00000000000000..7a0440d9c461d2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration_test.cc @@ -0,0 +1,118 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.h" + +#include + +#include +#include +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/status_matchers.h" + +namespace mlir::quant::stablehlo { +namespace { + +using ::mlir::quant::common::QuantizationTestBase; +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::quantization::CalibrationOptions; +using ::testing::Contains; +using ::testing::SizeIs; +using ::testing::StartsWith; +using ::testing::StrEq; +using ::tsl::testing::IsOk; + +// Matches an operation whose `getSymName` equals `name`. +MATCHER_P(HasSymName, name, "") { + auto non_const_arg = const_cast>(arg); + *result_listener << "where the name is " << non_const_arg.getSymName().str(); + return non_const_arg.getSymName() == name; +} + +// Matches an operation that has a StringAttr whose name is `name` and value +// matches `value_matcher`. +MATCHER_P2(HasStringAttr, name, value_matcher, + absl::StrCat(negation ? "doesn't have" : "has", + "string attribute: ", name, ", with desirable value")) { + auto non_const_arg = const_cast>(arg); + return non_const_arg->template hasAttrOfType(name) && + ExplainMatchResult( + value_matcher, + non_const_arg->template getAttrOfType(name).str(), + result_listener); +} + +// TODO: b/315746734 - Use test-only passes for in-depth and easier testing. 
+class PreCalibrationComponentTest : public QuantizationTestBase {}; + +TEST_F(PreCalibrationComponentTest, + HasCustomAggregatorOpAndQuantizableFuncForSimpleDotGeneral) { + PreCalibrationComponent component(&ctx_, CalibrationOptions()); + OwningOpRef module_op = ParseModuleOpString(R"mlir( + module attributes {} { + func.func @main(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> attributes {} { + %0 = stablehlo.constant dense<1.0> : tensor<4x3xf32> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + } + )mlir"); + + absl::StatusOr pre_calibration_result = + component.Run(*module_op, QuantizationConfig()); + + EXPECT_THAT(pre_calibration_result, IsOk()); + + SmallVector func_ops; + for (auto func_op : pre_calibration_result->getOps()) { + func_ops.push_back(func_op); + } + ASSERT_THAT(func_ops, SizeIs(1)); + EXPECT_THAT(func_ops, Contains(HasSymName("main"))); + + // Tests that there is a XlaCallModuleOp that is a serialized quantizable + // function. + SmallVector xla_call_module_ops; + for (auto xla_call_module_op : func_ops[0].getOps()) { + xla_call_module_ops.push_back(xla_call_module_op); + } + ASSERT_THAT(xla_call_module_ops, SizeIs(2)); + EXPECT_THAT( + xla_call_module_ops, + Contains(HasStringAttr("_tfl_quant_trait", StrEq("fully_quantizable")))); + EXPECT_THAT(xla_call_module_ops, + Contains(HasStringAttr("_original_entry_function", + StartsWith("composite_dot_general_fn")))); + + // Tests that there are CustomAggregatorOps inserted. + SmallVector custom_aggregator_ops; + for (auto custom_aggregator_op : + func_ops[0].getOps()) { + custom_aggregator_ops.push_back(custom_aggregator_op); + } + EXPECT_THAT(custom_aggregator_ops, SizeIs(2)); +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD new file mode 100644 index 00000000000000..d3bf62dfce4923 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD @@ -0,0 +1,29 @@ +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/quantization/tensorflow:internal_visibility_allowlist_package", + ], + licenses = ["notice"], +) + +cc_library( + name = "stablehlo_op_quant_spec", + srcs = [ + "stablehlo_op_quant_spec.cc", + ], + hdrs = ["stablehlo_op_quant_spec.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc new file mode 100644 index 00000000000000..1a20e3d6d995f8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc @@ -0,0 +1,104 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir::quant::stablehlo { + +std::unique_ptr GetStableHloOpQuantSpec(Operation* op) { + auto spec = std::make_unique(); + if (auto call_op = dyn_cast_or_null(op)) { + auto entry_function = + call_op->getAttrOfType("_entry_function"); + StringRef function_name = entry_function.getValue(); + if (!function_name.startswith("composite_")) { + return spec; + } + if (function_name.contains("conv")) { + spec->coeff_op_quant_dim[1] = 3; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("dot_general")) { + spec->coeff_op_quant_dim[1] = -1; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + quant::GetUniformQuantizedTypeForBias}; + } + } else if (function_name.contains("dot")) { + spec->coeff_op_quant_dim[1] = -1; + if (function_name.contains("with_bias")) { + spec->biases_params[2] = {{0, 1}, + quant::GetUniformQuantizedTypeForBias}; + } + } + for (auto quantizable_operand : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(quantizable_operand.first); + } + } + return spec; +} + +std::unique_ptr GetStableHloQuantScaleSpec(Operation* op) { + auto scale_spec = std::make_unique(); + if (llvm::isa(op)) { + scale_spec->has_same_scale_requirement = true; + } + return scale_spec; +} + +bool IsOpQuantizableStableHlo(Operation* op) { + if (mlir::isa(op)) { + // Constant ops do not have QuantizableResult attribute but can be + // quantized. + return true; + } else if (op->hasTrait() || + isa(op)) { + // Terminators, qcast and decast are not quantizable. 
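`GetStableHloOpQuantSpec` above dispatches purely on the name stored in the lifted call's `_entry_function` attribute. The sketch below works two names through those branches; the `_fn_1` suffix, the `call_op` variable, and the spec field spellings are assumptions for illustration, only the substring checks come from the code above.

// Sketch of the resulting spec for two hypothetical entry-function names;
// `call_op` stands for a lifted call op obtained elsewhere.
auto spec = GetStableHloOpQuantSpec(call_op);
//
// _entry_function = "composite_conv_with_bias_fn_1":
//   spec->coeff_op_quant_dim == {{1, 3}}   // weight operand 1, channel dim 3
//   spec->biases_params.count(2) == 1      // operand 2 handled as a bias
//   spec->quantizable_operands == {1}      // the keys of coeff_op_quant_dim
//
// _entry_function = "composite_dot_general_fn_1":
//   spec->coeff_op_quant_dim == {{1, -1}}  // -1 means per-tensor weight
//   spec->biases_params.empty()            // no "with_bias" in the name
//   spec->quantizable_operands == {1}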
+ return false; + } + + if (GetStableHloQuantScaleSpec(op)->has_same_scale_requirement) { + return true; + } + + const bool attr_enforced_quantizable = + op->hasAttrOfType(kQuantTraitAttrName) && + op->getAttrOfType(kQuantTraitAttrName).getValue().str() == + QuantTraitValues[QuantizationTrait::FullyQuantizable]; + return attr_enforced_quantizable; +} + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h new file mode 100644 index 00000000000000..c898a99c08f68f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h @@ -0,0 +1,41 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_STABLEHLO_OP_QUANT_SPEC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_STABLEHLO_OP_QUANT_SPEC_H_ + +#include + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace mlir::quant::stablehlo { + +// Returns StableHLO quantization specs for an op. +std::unique_ptr GetStableHloOpQuantSpec(Operation* op); + +// Returns quantization scale specs (fixed output, same scale) for a StableHLO +// op. +std::unique_ptr GetStableHloQuantScaleSpec(Operation* op); + +// Checks if an op is quantizable in StableHLO quantizer. Argument op is not +// necessarily a StableHLO op. +bool IsOpQuantizableStableHlo(Operation* op); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_OPS_STABLEHLO_OP_QUANT_SPEC_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc index 2ff1ba9200261d..e51f55d14aeecf 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc @@ -53,78 +53,8 @@ limitations under the License. namespace mlir::quant::stablehlo { namespace { -// This helper function create ops to requantize `input` tensor and returns the -// output tensor. Clamping is done if output integer bit-width < 32. -// -// Requantization is essentially dequantize --> quantize. -// -// Dequantize: (input - zp) * scale -// Quantize: input / scale + zp -// -// Hence, -// output = (input - input_zp) * input_scale / output_scale + output_zp -// -// This is simplified as: -// output = input * merged_scale + merged_zp -// where: -// merged_zp = output_zp - input_zp * merged_scale. 
-// merged_scale = input_scale / output_scale. -Value Requantize(mlir::OpState op, Value input, - UniformQuantizedType input_quantized_type, - UniformQuantizedType output_quantized_type, - TensorType output_tensor_type, - ConversionPatternRewriter &rewriter) { - // Skip requantization when input and result have the same type. - if (input_quantized_type == output_quantized_type) { - return rewriter.create(op->getLoc(), output_tensor_type, - input); - } - - double merged_scale_fp = - input_quantized_type.getScale() / output_quantized_type.getScale(); - Value merged_scale = rewriter.create( - op->getLoc(), - rewriter.getF32FloatAttr(static_cast(merged_scale_fp))); - - auto float_tensor_type = - input.getType().cast().clone(rewriter.getF32Type()); - Value output_float = - rewriter.create(op->getLoc(), float_tensor_type, input); - - output_float = rewriter.create( - op->getLoc(), float_tensor_type, output_float, merged_scale, nullptr); - - // Add merged_zp only when it is non-zero. - double merged_zp_fp = output_quantized_type.getZeroPoint() - - input_quantized_type.getZeroPoint() * merged_scale_fp; - if (merged_zp_fp != 0) { - Value merged_zp = rewriter.create( - op->getLoc(), - rewriter.getF32FloatAttr(static_cast(merged_zp_fp))); - output_float = rewriter.create( - op->getLoc(), float_tensor_type, output_float, merged_zp, nullptr); - } - - // Clamp output if the output integer bit-width <32. - if (output_tensor_type.getElementType().cast().getWidth() < 32) { - Value quantization_min = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(static_cast( - output_quantized_type.getStorageTypeMin()))); - Value quantization_max = rewriter.create( - op->getLoc(), rewriter.getF32FloatAttr(static_cast( - output_quantized_type.getStorageTypeMax()))); - // Clamp results by [quantization_min, quantization_max]. - output_float = rewriter.create( - op->getLoc(), float_tensor_type, quantization_min, output_float, - quantization_max); - } - - output_float = rewriter.create( - op->getLoc(), float_tensor_type, output_float); - return rewriter.create(op->getLoc(), output_tensor_type, - output_float); -} - +// TODO: b/311218165 - consider extract this to common utils and better ways to +// handle polymorphism. using QuantType = std::variant; FailureOr GetQuantType(Type type) { @@ -139,6 +69,22 @@ FailureOr GetQuantType(Type type) { } } +bool IsPerTensorType(QuantType quant_type) { + return std::holds_alternative(quant_type); +} + +bool IsPerChannelType(QuantType quant_type) { + return std::holds_alternative(quant_type); +} + +UniformQuantizedType GetPerTensorType(QuantType quant_type) { + return std::get(quant_type); +} + +UniformQuantizedPerAxisType GetPerChannelType(QuantType quant_type) { + return std::get(quant_type); +} + // Extract scale and zero point info from input quant type info. 
void GetQuantizationParams(OpBuilder &builder, Location loc, QuantType quant_type, Value &scales, @@ -161,7 +107,7 @@ void GetQuantizationParams(OpBuilder &builder, Location loc, } else { auto &quant_per_channel_type = std::get(quant_type); - llvm::SmallVector scales_vec; + SmallVector scales_vec; for (auto scale : quant_per_channel_type.getScales()) scales_vec.push_back(scale); scales = builder.create( @@ -172,7 +118,7 @@ void GetQuantizationParams(OpBuilder &builder, Location loc, builder.getF32Type()), scales_vec)); if (output_zero_point_in_fp) { - llvm::SmallVector zero_points_vec; + SmallVector zero_points_vec; for (auto zero_point : quant_per_channel_type.getZeroPoints()) zero_points_vec.push_back(zero_point); zero_points = builder.create( @@ -183,7 +129,7 @@ void GetQuantizationParams(OpBuilder &builder, Location loc, builder.getF32Type()), zero_points_vec)); } else { - llvm::SmallVector zero_points_vec; + SmallVector zero_points_vec; for (auto zero_point : quant_per_channel_type.getZeroPoints()) zero_points_vec.push_back(zero_point); zero_points = builder.create( @@ -241,6 +187,147 @@ Type GetQuantStorageType(Type type) { } } +Type GetQuantStorageType(QuantType type) { + if (IsPerTensorType(type)) { + return GetPerTensorType(type).getStorageType(); + } else { + return GetPerChannelType(type).getStorageType(); + } +} + +Value ApplyMergedScalesAndZps(OpBuilder &builder, Location loc, + QuantType input_quant_type, + QuantType output_quant_type, + Value input_float_tensor) { + // Use single merged scale and merged zp if both input and output are + // per-tensor quantized. Otherwise use a vector. + if (IsPerTensorType(input_quant_type) && IsPerTensorType(output_quant_type)) { + UniformQuantizedType input_per_tensor_tyep = + GetPerTensorType(input_quant_type); + UniformQuantizedType output_per_tensor_tyep = + GetPerTensorType(output_quant_type); + double merged_scale_fp = + input_per_tensor_tyep.getScale() / output_per_tensor_tyep.getScale(); + auto merged_scale = builder.create( + loc, builder.getF32FloatAttr(static_cast(merged_scale_fp))); + input_float_tensor = builder.create( + loc, input_float_tensor, merged_scale, + /*broadcast_dimensions=*/nullptr); + // Add merged_zp only when it is non-zero. + double merged_zp_fp = + output_per_tensor_tyep.getZeroPoint() - + input_per_tensor_tyep.getZeroPoint() * merged_scale_fp; + if (merged_zp_fp != 0) { + Value merged_zp = builder.create( + loc, builder.getF32FloatAttr(static_cast(merged_zp_fp))); + input_float_tensor = builder.create( + loc, input_float_tensor, merged_zp, /*broadcast_dimensions=*/nullptr); + } + } else { + int64_t channel_size = + IsPerChannelType(output_quant_type) + ? GetPerChannelType(output_quant_type).getScales().size() + : GetPerChannelType(input_quant_type).getScales().size(); + int64_t quantized_dimension = + IsPerChannelType(output_quant_type) + ? GetPerChannelType(output_quant_type).getQuantizedDimension() + : GetPerChannelType(input_quant_type).getQuantizedDimension(); + SmallVector merged_scale_double, merged_zp_double; + merged_scale_double.resize(channel_size); + merged_zp_double.resize(channel_size); + for (int i = 0; i < channel_size; ++i) { + merged_scale_double[i] = + (IsPerChannelType(input_quant_type) + ? GetPerChannelType(input_quant_type).getScales()[i] + : GetPerTensorType(input_quant_type).getScale()) / + (IsPerChannelType(output_quant_type) + ? 
GetPerChannelType(output_quant_type).getScales()[i] + : GetPerTensorType(output_quant_type).getScale()); + merged_zp_double[i] = + (IsPerChannelType(output_quant_type) + ? GetPerChannelType(output_quant_type).getZeroPoints()[i] + : GetPerTensorType(output_quant_type).getZeroPoint()) - + (IsPerChannelType(input_quant_type) + ? GetPerChannelType(input_quant_type).getZeroPoints()[i] + : GetPerTensorType(input_quant_type).getZeroPoint()) * + merged_scale_double[i]; + } + SmallVector merged_scale_float(merged_scale_double.begin(), + merged_scale_double.end()), + merged_zp_float(merged_zp_double.begin(), merged_zp_double.end()); + + auto broadcast_dims = DenseIntElementsAttr::get( + RankedTensorType::get({1}, builder.getI64Type()), + {quantized_dimension}); + Value merged_scale = builder.create( + loc, DenseFPElementsAttr::get( + RankedTensorType::get({channel_size}, builder.getF32Type()), + merged_scale_float)); + input_float_tensor = builder.create( + loc, input_float_tensor, merged_scale, broadcast_dims); + if (llvm::any_of(merged_zp_float, [](double zp) { return zp != 0; })) { + Value merged_zp = builder.create( + loc, DenseFPElementsAttr::get( + RankedTensorType::get({channel_size}, builder.getF32Type()), + merged_zp_float)); + input_float_tensor = builder.create( + loc, input_float_tensor, merged_zp, broadcast_dims); + } + } + return input_float_tensor; +} + +// This helper function create ops to requantize `input` tensor and returns the +// output tensor. Clamping is done if output integer bit-width < i32. It assumes +// that if both input and output tensor are per-channel quantized, they have the +// same quantization axis. +// +// Requantization is essentially dequantize --> quantize. +// +// Dequantize: (input - zp) * scale +// Quantize: input / scale + zp +// +// Hence, +// output = (input - input_zp) * input_scale / output_scale + output_zp +// +// This is simplified as: +// output = input * merged_scale + merged_zp +// where: +// merged_zp = output_zp - input_zp * merged_scale. +// merged_scale = input_scale / output_scale. +Value Requantize(mlir::OpState op, Value input, QuantType input_quant_type, + QuantType output_quant_type, TensorType output_tensor_type, + ConversionPatternRewriter &rewriter) { + // Skip requantization when input and result have the same type. + if (input_quant_type == output_quant_type) { + return rewriter.create(op->getLoc(), output_tensor_type, + input); + } + + auto float_tensor_type = output_tensor_type.clone(rewriter.getF32Type()); + Value output_float = + rewriter.create(op->getLoc(), float_tensor_type, input); + + output_float = + ApplyMergedScalesAndZps(rewriter, op->getLoc(), input_quant_type, + output_quant_type, output_float); + + // Clamp output if the output integer bit-width <32. + if (output_tensor_type.getElementType().cast().getWidth() < 32) { + Value quantization_min, quantization_max; + GetQuantizationStorageInfo(rewriter, op->getLoc(), output_quant_type, + quantization_min, quantization_max); + // Clamp results by [quantization_min, quantization_max]. 
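To make the merged-scale algebra in the requantization comment above concrete, here is a small standalone check; the scales and zero points are illustrative values only.

#include <cassert>

int main() {
  // Input quantization:  scale = 0.5,  zero point = 10.
  // Output quantization: scale = 0.25, zero point = 4.
  const double input_scale = 0.5, output_scale = 0.25;
  const int input_zp = 10, output_zp = 4;

  const double merged_scale = input_scale / output_scale;        // 2.0
  const double merged_zp = output_zp - input_zp * merged_scale;  // -16.0

  // A stored value of 30 represents (30 - 10) * 0.5 = 10.0 in real terms.
  const int q_in = 30;
  const double q_out = q_in * merged_scale + merged_zp;          // 44.0

  // The requantized value encodes the same real number under the output
  // parameters: (44 - 4) * 0.25 == (30 - 10) * 0.5 == 10.0.
  assert((q_out - output_zp) * output_scale ==
         (q_in - input_zp) * input_scale);
  return 0;
}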
+ output_float = rewriter.create( + op->getLoc(), quantization_min, output_float, quantization_max); + } + + output_float = rewriter.create( + op->getLoc(), float_tensor_type, output_float); + return rewriter.create(op->getLoc(), output_tensor_type, + output_float); +} + class ConvertUniformQuantizeOp : public OpConversionPattern { public: @@ -255,10 +342,24 @@ class ConvertUniformQuantizeOp if (succeeded(quant_type)) { return matchAndRewriteQuantize(op, adaptor, rewriter, *quant_type); } - } else if (input_element_type.isa()) { - return matchAndRewriteRequantize(op, adaptor, rewriter); + } else if (input_element_type.isa()) { + auto input_quant_type = GetQuantType(input_element_type); + auto output_quant_type = GetQuantType(op.getResult().getType()); + if (succeeded(input_quant_type) && succeeded(output_quant_type)) { + if (IsPerChannelType(*input_quant_type) && + IsPerChannelType(*output_quant_type) && + GetPerChannelType(*input_quant_type).getQuantizedDimension() != + GetPerChannelType(*output_quant_type).getQuantizedDimension()) { + op->emitError("Cannot requantize while changing quantization_axis"); + return failure(); + } + return matchAndRewriteRequantize(op, adaptor, rewriter, + *input_quant_type, *output_quant_type); + } } - return rewriter.notifyMatchFailure(op, "Unsupported input element type."); + op->emitError("Unsupported input element type."); + return failure(); } LogicalResult matchAndRewriteQuantize(mhlo::UniformQuantizeOp op, @@ -298,16 +399,14 @@ class ConvertUniformQuantizeOp LogicalResult matchAndRewriteRequantize( mhlo::UniformQuantizeOp op, mhlo::UniformQuantizeOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - auto input_quantized_type = getElementTypeOrSelf(op.getOperand().getType()) - .cast(); - auto output_quantized_type = getElementTypeOrSelf(op.getResult().getType()) - .cast(); + ConversionPatternRewriter &rewriter, QuantType input_quant_type, + QuantType output_quant_type) const { rewriter.replaceOp( - op, Requantize(op, adaptor.getOperand(), input_quantized_type, - output_quantized_type, + op, Requantize(op, adaptor.getOperand(), input_quant_type, + output_quant_type, + /*output_tensor_type=*/ op.getResult().getType().cast().clone( - output_quantized_type.getStorageType()), + GetQuantStorageType(output_quant_type)), rewriter)); return success(); } @@ -357,18 +456,18 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::AddOp op, mhlo::AddOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto lhs_element_type = - op.getLhs().getType().getElementType().dyn_cast(); - auto rhs_element_type = - op.getRhs().getType().getElementType().dyn_cast(); - auto result_element_type = op.getResult() - .getType() - .getElementType() - .dyn_cast(); + auto lhs_quant_type = + GetQuantType(getElementTypeOrSelf(op.getLhs().getType())); + auto rhs_quant_type = + GetQuantType(getElementTypeOrSelf(op.getRhs().getType())); + auto res_quant_type = + GetQuantType(getElementTypeOrSelf(op.getResult().getType())); // We only handle cases where lhs, rhs and results all have quantized // element type. 
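In the add lowering below, both operands are first requantized to the result's scale and zero point, the integer values are summed, and the result zero point is subtracted exactly once. The identity behind that single subtraction is (lhs_q - z)*s + (rhs_q - z)*s = ((lhs_q + rhs_q - z) - z)*s; a quick standalone check with illustrative numbers:

#include <cassert>

int main() {
  // Result quantization parameters (illustrative): scale 0.25, zero point 5.
  const double s = 0.25;
  const int z = 5;
  // Two operands already expressed with the result's scale and zero point.
  const int lhs_q = 13, rhs_q = 21;     // real values 2.0 and 4.0

  const int res_q = lhs_q + rhs_q - z;  // subtract the zero point once: 29
  // 29 encodes (29 - 5) * 0.25 = 6.0, which is 2.0 + 4.0.
  assert((res_q - z) * s == (lhs_q - z) * s + (rhs_q - z) * s);
  return 0;
}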
- if (!lhs_element_type || !rhs_element_type || !result_element_type) { + if (failed(lhs_quant_type) || IsPerChannelType(*lhs_quant_type) || + failed(rhs_quant_type) || IsPerChannelType(*rhs_quant_type) || + failed(res_quant_type) || IsPerChannelType(*res_quant_type)) { op->emitError( "AddOp requires the same quantized element type for all operands and " "results"); @@ -384,17 +483,17 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { // TODO: b/260280919 - Consider avoiding conversion to int32. Value lhs = adaptor.getLhs(); Value lhs_int32_tensor = - Requantize(op, lhs, lhs_element_type, result_element_type, + Requantize(op, lhs, *lhs_quant_type, *res_quant_type, res_int32_tensor_type, rewriter); Value rhs = adaptor.getRhs(); Value rhs_int32_tensor = - Requantize(op, rhs, rhs_element_type, result_element_type, + Requantize(op, rhs, *rhs_quant_type, *res_quant_type, res_int32_tensor_type, rewriter); Value zero_point = rewriter.create( op->getLoc(), rewriter.getI32IntegerAttr(static_cast( - result_element_type.getZeroPoint()))); + GetPerTensorType(*res_quant_type).getZeroPoint()))); // Now the lhs and rhs have been coverted to the same scale and zps. // Given: @@ -411,24 +510,26 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { Value res_int32 = rewriter.create( op->getLoc(), res_int32_tensor_type, add_result, zero_point, nullptr); - if (result_element_type.getStorageType().isInteger(32)) { + if (GetQuantStorageType(*res_quant_type).isInteger(32)) { // For i32, clamping is not needed. rewriter.replaceOp(op, res_int32); } else { // Clamp results by [quantization_min, quantization_max] when storage type // is not i32. Value result_quantization_min = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr(static_cast( - result_element_type.getStorageTypeMin()))); + op->getLoc(), + rewriter.getI32IntegerAttr(static_cast( + GetPerTensorType(*res_quant_type).getStorageTypeMin()))); Value result_quantization_max = rewriter.create( - op->getLoc(), rewriter.getI32IntegerAttr(static_cast( - result_element_type.getStorageTypeMax()))); + op->getLoc(), + rewriter.getI32IntegerAttr(static_cast( + GetPerTensorType(*res_quant_type).getStorageTypeMax()))); res_int32 = rewriter.create( op->getLoc(), res_int32_tensor_type, result_quantization_min, res_int32, result_quantization_max); // Convert results back to result storage type. auto res_final_tensor_type = - res_int32_tensor_type.clone(result_element_type.getStorageType()); + res_int32_tensor_type.clone(GetQuantStorageType(*res_quant_type)); rewriter.replaceOpWithNewOp(op, res_final_tensor_type, res_int32); } @@ -445,12 +546,12 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { // dimensions are defined in // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#dot_general. 
struct DotLikeDimensionNumbers { - ArrayRef lhs_batching_dims; - ArrayRef lhs_spatial_dims; - ArrayRef lhs_contracting_dims; - ArrayRef rhs_batching_dims; - ArrayRef rhs_spatial_dims; - ArrayRef rhs_contracting_dims; + SmallVector lhs_batching_dims; + SmallVector lhs_spatial_dims; + SmallVector lhs_contracting_dims; + SmallVector rhs_batching_dims; + SmallVector rhs_spatial_dims; + SmallVector rhs_contracting_dims; }; // A shared matchAndRewrite implementation for dot-like hybrid quantized @@ -503,7 +604,7 @@ LogicalResult matchAndRewriteDotLikeHybridOp( Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, Value tensor, const int64_t other_tensor_zp, - ArrayRef reduction_dims) { + SmallVector reduction_dims) { // This function calculates part of the zero-point-offset by using // mhlo::Reduce to sum over the contracting dims of the tensor, and then // multiply by zp of the other tensor. @@ -512,7 +613,7 @@ Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, // Calculate the output tensor shape. This is input tensor dims minus // contracting dims. auto ranked_tensor = tensor.getType().cast(); - llvm::SmallVector output_dims; + SmallVector output_dims; for (int64_t i = 0; i < ranked_tensor.getRank(); ++i) { if (absl::c_count(reduction_dims, i) == 0) { output_dims.push_back(ranked_tensor.getDimSize(i)); @@ -581,7 +682,7 @@ Value CalculateDynamicOutputDims(OpBuilder &builder, Location loc, Value lhs, // Calculate each output dim and concatenate into a 1D tensor. // Output dims are batching dims, spatial dims, LHS result dims, RHS result // dims. - llvm::SmallVector output_dims; + SmallVector output_dims; for (int64_t i = 0; i < lhs_shape.getRank(); ++i) { if (absl::c_count(dims.lhs_batching_dims, i) != 0) { output_dims.push_back(GetDimValue(builder, loc, lhs, lhs_shape, i)); @@ -612,8 +713,8 @@ Value CalculateDynamicOutputDims(OpBuilder &builder, Location loc, Value lhs, Value BroadcastZpContribution(OpBuilder &builder, Location loc, Value zp_contribution, - llvm::ArrayRef reduction_dims, - llvm::ArrayRef batching_dims, + ArrayRef reduction_dims, + ArrayRef batching_dims, int64_t non_batching_starting_idx, TensorType output_tensor_type, Value &output_dims_value, Value lhs, Value rhs, @@ -623,7 +724,7 @@ Value BroadcastZpContribution(OpBuilder &builder, Location loc, // broadcast. auto zp_contribution_rank = zp_contribution.getType().cast().getRank(); - llvm::SmallVector broadcast_dims; + SmallVector broadcast_dims; broadcast_dims.resize(zp_contribution_rank, 0); // Result tensor will have batching dims first, then LHS result dims, then // RHS result dims. So non-batching result dims index doesn't start from 0. @@ -643,9 +744,9 @@ Value BroadcastZpContribution(OpBuilder &builder, Location loc, broadcast_dims[idx] = result_batching_idx++; } } - // Use broadcast_in_dim or dyanmic_broadcast_in_dim based on input shape + // Use broadcast_in_dim or dyanmic_broadcast_in_dim based on output shape // dynamism. - if (zp_contribution.getType().cast().hasStaticShape()) { + if (output_tensor_type.cast().hasStaticShape()) { zp_contribution = builder.create( loc, output_tensor_type, zp_contribution, DenseIntElementsAttr::get( @@ -677,9 +778,8 @@ Value CalculateZeroPointOffset(OpBuilder &builder, Location loc, Value lhs, Value output_dims_value = nullptr; // Calculate LHS contribution when RHS zp is non-zero. 
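The zero-point offset helpers around `CreateZeroPointPartialOffset` implement the standard expansion of a quantized dot product: over a contracting dimension of size k, sum((lhs_q - lhs_zp) * (rhs_q - rhs_zp)) = sum(lhs_q * rhs_q) - rhs_zp * sum(lhs_q) - lhs_zp * sum(rhs_q) + k * lhs_zp * rhs_zp, so the kernel can run on raw integers and the correction terms are added back afterwards (the constant k term is handled outside the hunks shown here). A tiny standalone check of the identity with illustrative values:

#include <cassert>

int main() {
  // One contracting dimension of size k = 3, illustrative quantized values.
  const int lhs_q[3] = {7, 2, 9}, rhs_q[3] = {4, 6, 1};
  const int lhs_zp = 3, rhs_zp = 2, k = 3;

  int exact = 0, raw = 0, lhs_sum = 0, rhs_sum = 0;
  for (int i = 0; i < k; ++i) {
    exact += (lhs_q[i] - lhs_zp) * (rhs_q[i] - rhs_zp);
    raw += lhs_q[i] * rhs_q[i];
    lhs_sum += lhs_q[i];
    rhs_sum += rhs_q[i];
  }
  // The raw integer dot minus the zero-point corrections reproduces the
  // exact zero-point-adjusted result.
  assert(exact ==
         raw - rhs_zp * lhs_sum - lhs_zp * rhs_sum + k * lhs_zp * rhs_zp);
  return 0;
}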
if (rhs_zp != 0) { - llvm::SmallVector reduction_dims = - llvm::to_vector(llvm::concat(dims.lhs_spatial_dims, - dims.lhs_contracting_dims)); + SmallVector reduction_dims = to_vector(llvm::concat( + dims.lhs_spatial_dims, dims.lhs_contracting_dims)); Value lhs_zp_contribution = CreateZeroPointPartialOffset(builder, loc, lhs, rhs_zp, reduction_dims); // Broadcast lhs ZP contribution to result tensor shape. @@ -691,9 +791,8 @@ Value CalculateZeroPointOffset(OpBuilder &builder, Location loc, Value lhs, } // Calculate RHS contribution when LHS zp is non-zero. if (lhs_zp != 0) { - llvm::SmallVector reduction_dims = - llvm::to_vector(llvm::concat(dims.rhs_spatial_dims, - dims.rhs_contracting_dims)); + SmallVector reduction_dims = to_vector(llvm::concat( + dims.rhs_spatial_dims, dims.rhs_contracting_dims)); Value rhs_zp_contribution = CreateZeroPointPartialOffset(builder, loc, rhs, lhs_zp, reduction_dims); // Broadcast rhs ZP contribution to result tensor shape. @@ -762,11 +861,13 @@ Value CreateDotLikeKernel(OpBuilder &builder, Location loc, Value &rhs, ArrayRef attrs) { // We only handle the case where RHS zp is zero. - auto original_padding = op.getPaddingAttr().getValues(); - // Explicitly pad LHS with zp and update LHS value. - llvm::SmallVector new_attrs(attrs); - if (llvm::any_of(original_padding, [](int64_t x) { return x != 0; })) { + SmallVector new_attrs(attrs); + if (op.getPadding().has_value() && + llvm::any_of(op.getPaddingAttr().getValues(), + [](int64_t x) { return x != 0; })) { + auto original_padding = op.getPaddingAttr().getValues(); + Value zp = builder.create( loc, DenseIntElementsAttr::get( @@ -779,7 +880,7 @@ Value CreateDotLikeKernel(OpBuilder &builder, Location loc, // mhlo::Convolution. But mhlo::Pad require those for all dimensions. Hence // we add 0 to the beginning and end of the padding vectors. 
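As the comment above notes, `mhlo.convolution` padding covers only the spatial dimensions while `mhlo.pad` needs entries for every dimension, so the loop below prepends and appends zeros. A standalone check of that index arithmetic for a rank-4 NHWC input with spatial padding [[1, 1], [2, 2]]; the NHWC layout and the assumption that the high side takes the odd entries are illustrative, only the low-side indexing is visible in this hunk.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Flattened spatial padding [[1, 1], [2, 2]] as stored in the padding attr.
  const std::vector<int64_t> original_padding = {1, 1, 2, 2};
  const int64_t rank = 4;  // e.g. NHWC

  std::vector<int64_t> padding_low(rank, 0), padding_high(rank, 0);
  for (int64_t i = 1; i < rank - 1; ++i) {
    padding_low[i] = original_padding[i * 2 - 2];
    padding_high[i] = original_padding[i * 2 - 1];
  }
  // Batch and feature dimensions stay unpadded.
  assert((padding_low == std::vector<int64_t>{0, 1, 2, 0}));
  assert((padding_high == std::vector<int64_t>{0, 1, 2, 0}));
  return 0;
}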
int64_t rank = lhs.getType().cast().getRank(); - llvm::SmallVector padding_low(rank, 0), padding_high(rank, 0), + SmallVector padding_low(rank, 0), padding_high(rank, 0), padding_interior(rank, 0); for (int64_t i = 1; i < rank - 1; ++i) { padding_low[i] = original_padding[i * 2 - 2]; @@ -962,7 +1063,7 @@ class ConvertUniformQuantizedDotOp : public OpConversionPattern { rewriter.getContext(), /*lhsBatchingDimensions=*/{}, /*rhsBatchingDimensions=*/{}, /*lhsContractingDimensions=*/{1}, /*rhsContractingDimensions=*/{0}); - llvm::SmallVector attrs(op->getAttrs()); + SmallVector attrs(op->getAttrs()); attrs.push_back( {StringAttr::get(rewriter.getContext(), "dot_dimension_numbers"), dims}); @@ -997,12 +1098,14 @@ class ConvertUniformQuantizedDotGeneralOp return matchAndRewriteDotLikeOp( op, adaptor, op->getAttrs(), DotLikeDimensionNumbers{ - op.getDotDimensionNumbers().getLhsBatchingDimensions(), + to_vector(op.getDotDimensionNumbers().getLhsBatchingDimensions()), /*lhs_spatial_dims=*/{}, - op.getDotDimensionNumbers().getLhsContractingDimensions(), - op.getDotDimensionNumbers().getRhsBatchingDimensions(), + to_vector( + op.getDotDimensionNumbers().getLhsContractingDimensions()), + to_vector(op.getDotDimensionNumbers().getRhsBatchingDimensions()), /*rhs_spatial_dims=*/{}, - op.getDotDimensionNumbers().getRhsContractingDimensions()}, + to_vector( + op.getDotDimensionNumbers().getRhsContractingDimensions())}, rewriter); } } @@ -1088,7 +1191,7 @@ FailureOr VerifyAndConstructDims( auto res_element_quant_per_channel_type = getElementTypeOrSelf(op.getResult()) .cast(); - llvm::SmallVector scale_ratios( + SmallVector scale_ratios( res_element_quant_per_channel_type.getScales().size()); for (int i = 0; i < scale_ratios.size(); ++i) { scale_ratios[i] = @@ -1106,7 +1209,8 @@ FailureOr VerifyAndConstructDims( } } // lhs_dilation must not exist. - if (llvm::any_of(op.getLhsDilationAttr().getValues(), + if (op.getLhsDilation().has_value() && + llvm::any_of(op.getLhsDilationAttr().getValues(), [](int64_t dilate) { return dilate != 1; })) { op->emitError("lhs_dilation must be 1."); return failure(); @@ -1160,6 +1264,7 @@ class ConvertUniformQuantizedConvolutionOp // This pattern lowers a generic MHLO op for uq->int. // This pattern essentially just performs type change, with no algorithm change. +// TODO: b/310685906 - Add operand/result type validations. class ConvertGenericOp : public ConversionPattern { public: explicit ConvertGenericOp(MLIRContext *ctx) @@ -1169,36 +1274,16 @@ class ConvertGenericOp : public ConversionPattern { Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const override { // This pattern only handle selected ops. - if (!isa(op)) { + if (!isa(op)) { return failure(); } - // Check that all operands and result uq types are the same. - llvm::SmallVector uq_types; - for (auto result_type : op->getResultTypes()) { - auto type = - getElementTypeOrSelf(result_type).dyn_cast(); - if (type) { - uq_types.push_back(type); - } - } - for (auto operand : op->getOperands()) { - auto type = getElementTypeOrSelf(operand.getType()) - .dyn_cast(); - if (type) { - uq_types.push_back(type); - } - } - for (auto type : uq_types) { - if (type != uq_types.front()) { - return failure(); - } - } - // Determine new result type: use storage type for uq types; use original // type otherwise. 
- llvm::SmallVector new_result_types; + SmallVector new_result_types; for (auto result_type : op->getResultTypes()) { new_result_types.push_back(GetQuantStorageType(result_type)); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc index f20d1b3609361e..1987b607392379 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc @@ -62,7 +62,9 @@ limitations under the License. namespace mlir::quant::stablehlo { namespace { -class ConvertTfQuantToMhloIntTest : public ::testing::Test { +using ::testing::Test; + +class ConvertTfQuantToMhloIntTest : public Test { protected: void SetUp() override { DialectRegistry dialects; @@ -281,7 +283,7 @@ class ConvertTfQuantToMhloIntTest : public ::testing::Test { absl::BitGen bitgen_; }; -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeAndDequantize) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeAndDequantizeToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main(%arg0: tensor<10xf32>) -> tensor<10xf32> { %scale = "tf.Const"() { value = dense<0.347> : tensor } : () -> tensor @@ -306,7 +308,7 @@ func.func @main(%arg0: tensor<10xf32>) -> tensor<10xf32> { kProgram, {&arg0}, /*tf_program=*/std::nullopt, /*error_tolerance=*/0.35); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizePerChannel) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizePerChannelToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main( %arg0: tensor<10x10xf32>, %scale: tensor<10xf32>, %zp: tensor<10xi32> @@ -330,7 +332,7 @@ func.func @main( /*error_tolerance=*/1.0); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformDequantizePerChannel) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformDequantizePerChannelToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main( %arg0: tensor<10x10xi8>, %scale: tensor<10xf32>, %zp: tensor<10xi32> @@ -350,7 +352,7 @@ func.func @main( ExecuteAndCompareResultsWithTfKernel(kProgram, {&arg0, &scale, &zp}); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeConvolution) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeConvolutionToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main(%input: tensor<1x9x9x9xi8>, %filter: tensor<3x3x9x10xi8>) -> tensor<1x9x9x10xi32> { %input_scale = "tf.Const"() { value = dense<2.0> : tensor } : () -> tensor @@ -389,7 +391,8 @@ func.func @main(%input: tensor<1x9x9x9xi8>, %filter: tensor<3x3x9x10xi8>) -> ten ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeConvolutionPerChannel) { +TEST_F(ConvertTfQuantToMhloIntTest, + UniformQuantizeConvolutionPerChannelToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main( %input: tensor<1x9x9x9xi8>, %filter: tensor<3x3x9x10xi8>, %scale: tensor<10xf32> @@ -428,7 +431,8 @@ func.func @main( ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter, &scale}); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeConvolutionHybrid) { +TEST_F(ConvertTfQuantToMhloIntTest, + UniformQuantizeConvolutionHybridToValidGraph) { constexpr absl::string_view kTfProgram = R"mlir( func.func @main(%input: tensor<2x10x10x10xf32>, %filter: tensor<3x3x10x20xi8>) -> tensor<2x10x10x20xf32> { 
%filter_scale = "tf.Const"() { value = dense<0.047> : tensor } : () -> tensor @@ -476,7 +480,7 @@ func.func @main(%input: tensor<2x10x10x10xf32>, %filter: tensor<3x3x10x20xi8>) - ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}, kTfProgram); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeDot) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeDotToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main(%input: tensor<8x9xi8>, %filter: tensor<9x10xi8>) -> tensor<8x10xi32> { %input_scale = "tf.Const"() { value = dense<0.588> : tensor } : () -> tensor @@ -513,7 +517,7 @@ func.func @main(%input: tensor<8x9xi8>, %filter: tensor<9x10xi8>) -> tensor<8x10 ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeDotHybrid) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeDotHybridToValidGraph) { constexpr absl::string_view kTfProgram = R"mlir( func.func @main(%input: tensor<8x9xf32>, %filter: tensor<9x10xi8>) -> tensor<8x10xf32> { %filter_scale = "tf.Const"() { value = dense<0.0235> : tensor } : () -> tensor @@ -550,7 +554,7 @@ func.func @main(%input: tensor<8x9xf32>, %filter: tensor<9x10xi8>) -> tensor<8x1 ExecuteAndCompareResultsWithTfKernel(kProgram, {&input, &filter}, kTfProgram); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformRequantize) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformRequantizeToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main(%input: tensor<10xi8>) -> tensor<10xi8> { %input_scale = "tf.Const"() { value = dense<0.2235> : tensor } : () -> tensor @@ -579,7 +583,131 @@ func.func @main(%input: tensor<10xi8>) -> tensor<10xi8> { ExecuteAndCompareResultsWithTfKernel(kProgram, {&input}); } -TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeAdd) { +TEST_F(ConvertTfQuantToMhloIntTest, UniformRequantizePerChannelToValidGraph) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main( + %input: tensor<10x10xi8>, %input_scale: tensor<10xf32>, + %input_zp: tensor<10xi32>, %output_scale: tensor<10xf32>, + %output_zp: tensor<10xi32> + ) -> tensor<10x10xi8> { + %0 = "tf.Cast"(%input) {} : (tensor<10x10xi8>) -> tensor<10x10x!tf_type.qint8> + %1 = "tf.UniformRequantize"( + %0, %input_scale, %input_zp, %output_scale, %output_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT8", attr_map = "", + device = "", input_quantization_axis = 1, + input_quantization_max_val = 127 : i64, + input_quantization_min_val = -128 : i64, + output_quantization_axis = 1 : i64, + output_quantization_max_val = 127 : i64, + output_quantization_min_val = -128 : i64 + } : ( + tensor<10x10x!tf_type.qint8>, tensor<10xf32>, tensor<10xi32>, + tensor<10xf32>, tensor<10xi32> + ) -> tensor<10x10x!tf_type.qint8> + %2 = "tf.Cast"(%1) {} : (tensor<10x10x!tf_type.qint8>) -> tensor<10x10xi8> + return %2 : tensor<10x10xi8> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomI8Literal({10, 10})); + TF_ASSERT_OK_AND_ASSIGN( + auto input_scale, + CreateRandomF32Literal({10}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto input_zp, CreateRandomI32Literal({10})); + TF_ASSERT_OK_AND_ASSIGN( + auto output_scale, + CreateRandomF32Literal({10}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto output_zp, CreateRandomI32Literal({10})); + // error_tolerance is set to be 1 because different rounding implementations + // in TF kernel and the lowering passes may cause +/-1 differences. 
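The +/-1 tolerance mentioned in these tests reflects the usual half-way rounding discrepancy: a real value exactly between two integers may legitimately round either way depending on the rounding rule, so two correct kernels can disagree by one quantized step. The snippet below only illustrates that two standard rounding rules differ at such points; it makes no claim about which rule each implementation uses.

#include <cassert>
#include <cmath>

int main() {
  // 2.5 is exactly half-way between 2 and 3.
  const double x = 2.5;
  // Round-half-away-from-zero (std::round) gives 3 ...
  assert(std::round(x) == 3.0);
  // ... while round-half-to-even (the default floating-point rounding mode
  // used by std::nearbyint) gives 2, one quantized step lower.
  assert(std::nearbyint(x) == 2.0);
  return 0;
}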
+ ExecuteAndCompareResultsWithTfKernel( + kProgram, {&input, &input_scale, &input_zp, &output_scale, &output_zp}, + /*tf_program=*/std::nullopt, + /*error_tolerance=*/1.0); +} + +TEST_F(ConvertTfQuantToMhloIntTest, + UniformRequantizePerTensorToPerChannelToValidGraph) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main( + %input: tensor<10x10xi8>, %input_scale: tensor, %input_zp: tensor, + %output_scale: tensor<10xf32>, %output_zp: tensor<10xi32> + ) -> tensor<10x10xi8> { + %0 = "tf.Cast"(%input) {} : (tensor<10x10xi8>) -> tensor<10x10x!tf_type.qint8> + %1 = "tf.UniformRequantize"( + %0, %input_scale, %input_zp, %output_scale, %output_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT8", attr_map = "", + device = "", input_quantization_axis = -1, + input_quantization_max_val = 127 : i64, + input_quantization_min_val = -128 : i64, + output_quantization_axis = 1 : i64, + output_quantization_max_val = 127 : i64, + output_quantization_min_val = -128 : i64 + } : ( + tensor<10x10x!tf_type.qint8>, tensor, tensor, + tensor<10xf32>, tensor<10xi32> + ) -> tensor<10x10x!tf_type.qint8> + %2 = "tf.Cast"(%1) {} : (tensor<10x10x!tf_type.qint8>) -> tensor<10x10xi8> + return %2 : tensor<10x10xi8> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomI8Literal({10, 10})); + TF_ASSERT_OK_AND_ASSIGN( + auto input_scale, CreateRandomF32Literal({}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto input_zp, CreateRandomI32Literal({})); + TF_ASSERT_OK_AND_ASSIGN( + auto output_scale, + CreateRandomF32Literal({10}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto output_zp, CreateRandomI32Literal({10})); + // error_tolerance is set to be 1 because different rounding implementations + // in TF kernel and the lowering passes may cause +/-1 differences. 
+ ExecuteAndCompareResultsWithTfKernel( + kProgram, {&input, &input_scale, &input_zp, &output_scale, &output_zp}, + /*tf_program=*/std::nullopt, + /*error_tolerance=*/1.0); +} + +TEST_F(ConvertTfQuantToMhloIntTest, + UniformRequantizePerChannelToPerTensorToValidGraph) { + constexpr absl::string_view kProgram = R"mlir( +func.func @main( + %input: tensor<10x10xi8>, %input_scale: tensor<10xf32>, + %input_zp: tensor<10xi32>, %output_scale: tensor, %output_zp: tensor + ) -> tensor<10x10xi8> { + %0 = "tf.Cast"(%input) {} : (tensor<10x10xi8>) -> tensor<10x10x!tf_type.qint8> + %1 = "tf.UniformRequantize"( + %0, %input_scale, %input_zp, %output_scale, %output_zp + ) { + Tin = "tfdtype$DT_QINT8", Tout = "tfdtype$DT_QINT8", attr_map = "", + device = "", input_quantization_axis = 1, + input_quantization_max_val = 127 : i64, + input_quantization_min_val = -128 : i64, + output_quantization_axis = -1 : i64, + output_quantization_max_val = 127 : i64, + output_quantization_min_val = -128 : i64 + } : ( + tensor<10x10x!tf_type.qint8>, tensor<10xf32>, tensor<10xi32>, + tensor, tensor + ) -> tensor<10x10x!tf_type.qint8> + %2 = "tf.Cast"(%1) {} : (tensor<10x10x!tf_type.qint8>) -> tensor<10x10xi8> + return %2 : tensor<10x10xi8> +})mlir"; + TF_ASSERT_OK_AND_ASSIGN(auto input, CreateRandomI8Literal({10, 10})); + TF_ASSERT_OK_AND_ASSIGN( + auto input_scale, + CreateRandomF32Literal({10}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto input_zp, CreateRandomI32Literal({10})); + TF_ASSERT_OK_AND_ASSIGN( + auto output_scale, CreateRandomF32Literal({}, /*min=*/0.0001, /*max=*/2)); + TF_ASSERT_OK_AND_ASSIGN(auto output_zp, CreateRandomI32Literal({})); + // error_tolerance is set to be 1 because different rounding implementations + // in TF kernel and the lowering passes may cause +/-1 differences. 
+ ExecuteAndCompareResultsWithTfKernel( + kProgram, {&input, &input_scale, &input_zp, &output_scale, &output_zp}, + /*tf_program=*/std::nullopt, + /*error_tolerance=*/1.0); +} + +TEST_F(ConvertTfQuantToMhloIntTest, UniformQuantizeAddToValidGraph) { constexpr absl::string_view kProgram = R"mlir( func.func @main(%lhs: tensor<10x10xi32>, %rhs: tensor<10x10xi32>) -> tensor<10x10xi32> { %lhs_scale = "tf.Const"() { value = dense<0.518> : tensor } : () -> tensor diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc index 856bbd49930341..9a5e6c53d3d1d6 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types_test.cc @@ -38,11 +38,12 @@ using ::mlir::MLIRContext; using ::mlir::ModuleOp; using ::mlir::OwningOpRef; using ::tensorflow::monitoring::testing::CellReader; +using ::testing::Test; static constexpr char kMetricsName[] = "/tensorflow/core/tf2xla/tf_quant_op_count"; -class LegalizeTfTypesTest : public ::testing::Test { +class LegalizeTfTypesTest : public Test { protected: void CreateModule(const char* module_string) { DialectRegistry mlir_registry; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc index 1fd1a0b6bab721..4c20b6bebdcdad 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc @@ -34,7 +34,9 @@ limitations under the License. namespace mlir::quant::stablehlo { namespace { -class LegalizeTFQuantTest : public ::testing::Test { +using ::testing::Test; + +class LegalizeTFQuantTest : public Test { protected: void TestBridgeLowering(llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes) { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc index 361d98c7775abe..2825195addea12 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc @@ -57,7 +57,7 @@ bool IsQuantType(Type type) { IsTFQintType(element_type); } -bool IsMhloUniformQuantizedOp(Operation* op) { +bool IsMhloUniformQuantizedOp(Operation& op) { return llvm::isa(op); } @@ -68,7 +68,7 @@ void VerifyQuantLegalization::runOnOperation() { // Verify all uq and qint types are lowered. 
if (llvm::any_of(op->getOperandTypes(), IsQuantType) || llvm::any_of(op->getResultTypes(), IsQuantType) || - IsTFUniformQuantizedOp(op) || IsMhloUniformQuantizedOp(op)) { + IsTFUniformQuantizedOp(op) || IsMhloUniformQuantizedOp(*op)) { op->emitOpError("is illegal as it is a UQ op or contains uq/qint types"); LOG(ERROR) << "Found illegal op containing uq/qint type: " << op->getName().getStringRef().str(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc index 383f2430c94eee..6f13634b317aa4 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc @@ -29,9 +29,8 @@ limitations under the License. #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" -// TODO - b/303543789: Remove TF Quantizer util dependency. +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" namespace mlir::quant::stablehlo { @@ -42,7 +41,7 @@ namespace { // TODO - b/303543789: Move the helper functions below to a separate util. // Fetches the default or null attribute, used for pattern matching. -static Attribute DefaultOrNullAttr(OpBuilder& builder, Attribute& attr) { +Attribute DefaultOrNullAttr(OpBuilder& builder, const Attribute& attr) { if (!attr) { return builder.getStringAttr(kNullAttributeValue); } @@ -51,7 +50,7 @@ static Attribute DefaultOrNullAttr(OpBuilder& builder, Attribute& attr) { // Checks whether the value of a constant equals the given float, regardless // of the tensor dimension. -static bool FloatValueEquals(const Attribute& attr, double value) { +bool FloatValueEquals(const Attribute& attr, const double value) { auto fp_attr = attr.dyn_cast_or_null(); if (!fp_attr) return false; @@ -101,7 +100,7 @@ void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() { } // Remove all attr_map attributes. 
- module_op.walk([&](Operation* op) { op->removeAttr(kAttrMapAttribute); }); + module_op.walk([](Operation* op) { op->removeAttr(kAttrMapAttribute); }); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td index 116037d9130df2..0e7706c8d550a1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td @@ -20,10 +20,8 @@ include "mlir/IR/OpBase.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "stablehlo/dialect/StablehloOps.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.td" -include "tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td" -// TODO - b/303543789: Remove TF Quantizer util dependency. +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" //===----------------------------------------------------------------------===// // Pattern rules for lifting ops with bias as functions diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td index fc5af302e794a8..9bc337b8d46949 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td @@ -19,10 +19,8 @@ include "mlir/Dialect/Arith/IR/ArithOps.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "stablehlo/dialect/StablehloOps.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.td" -include "tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td" -// TODO - b/303543789: Remove TF Quantizer util dependency. 
+include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" //===----------------------------------------------------------------------===// // Pattern rules for lifting ops as functions @@ -56,4 +54,4 @@ def LiftDotGeneral : Pat< (NamedAttributeList (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), - [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; \ No newline at end of file + [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h index 0b05069b265989..4973c515d96a58 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h @@ -28,7 +28,7 @@ namespace mlir::quant::stablehlo { // Creates a `QuantizePass` that quantizes ops according to surrounding qcast / // dcast ops. -std::unique_ptr> CreateQuantizePass( +std::unique_ptr> CreateQuantizePass( const quant::QuantizationSpecs& quantization_specs); // Creates a pass that quantizes weight component of StableHLO graph. @@ -39,7 +39,7 @@ std::unique_ptr> CreateQuantizeWeightPass( // Creates an instance of the StableHLO dialect PrepareQuantize pass without any // arguments. Preset method of SRQ is set to the quantization option by default. std::unique_ptr> CreatePrepareQuantizePass( - bool enable_per_channel_quantization = true, int bit_width = 8); + bool enable_per_channel_quantization = false, int bit_width = 8); // Adds generated pass default constructors or options definitions. #define GEN_PASS_DECL diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td index 52dca7897ea05d..c69e72120538b3 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td @@ -67,7 +67,7 @@ def ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass : Pass<"stablehlo- }]; } -def QuantizePass : Pass<"stablehlo-quantize", "mlir::func::FuncOp"> { +def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> { let summary = "Applies static-range quantization on ops."; let dependentDialects = [ "mlir::stablehlo::StablehloDialect", @@ -103,3 +103,13 @@ def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-function "TF::TensorFlowDialect", ]; } + +def UnwrapXlaCallModuleOpPass : Pass<"stablehlo-unwrap-xla-call-module-op", "ModuleOp"> { + let summary = "Unwrap XlaCallModuleOps into inline functions if not used for quantizing fused patterns."; + let dependentDialects = ["TF::TensorFlowDialect"]; +} + +def PopulateShapePass : Pass<"populate-shape", "ModuleOp"> { + let summary = "Populate output shape with known information for CustomAggregatorOp and XlaCallModuleOp."; + let dependentDialects = ["TF::TensorFlowDialect"]; +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/populate_shape.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/populate_shape.cc new file mode 100644 index 00000000000000..0d4f0594f5c7d8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/populate_shape.cc @@ -0,0 +1,144 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/Support/Casting.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" +#include "tensorflow/core/ir/types/dialect.h" + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_POPULATESHAPEPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace { + +class PopulateShapeForCustomAggregatorOp + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + TF::CustomAggregatorOp op, TF::CustomAggregatorOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto input_shape_type = op.getInput().getType().dyn_cast(); + auto output_shape_type = op.getOutput().getType(); + + if (!input_shape_type.isa()) { + input_shape_type = adaptor.getInput().getType(); + } + + if (input_shape_type.isa() && + !output_shape_type.isa() && + TF::HasCompatibleElementTypes(input_shape_type, output_shape_type)) { + auto new_op = rewriter.create( + op->getLoc(), /*output=*/input_shape_type, + /*args=*/adaptor.getInput(), + /*Id=*/op.getId()); + new_op->setAttrs(op->getAttrs()); + rewriter.replaceOp(op, new_op); + return success(); + } + return failure(); + } +}; + +class PopulateShapeForXlaCallModuleOp + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + TF::XlaCallModuleOp op, TF::XlaCallModuleOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (op->getNumResults() != 1) { + op->emitError("XlaCallModuleOp doesn't have 1 output."); + return failure(); + } + // Assume XlaCallModuleOp only has 1 output. 
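+    // Illustrative example: a result typed tensor<*xf32> whose Sout entry is
+    // #tf_type.shape<1x1024> is rebuilt below as tensor<1x1024xf32>; if the
+    // Sout entry itself has no rank, the op is left untouched.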
+ auto output_shape_type = op->getResultTypes()[0]; + if (!output_shape_type.isa()) { + auto output_shape_attr = op.getSout()[0].dyn_cast(); + if (!output_shape_attr.hasRank()) { + return failure(); + } + auto new_output_shape_type = tensorflow::GetTypeFromTFTensorShape( + output_shape_attr.getShape(), + getElementTypeOrSelf(op.getResultTypes()[0])); + auto new_op = rewriter.create( + op->getLoc(), /*output=*/new_output_shape_type, + /*args=*/adaptor.getOperands(), + /*version=*/op.getVersionAttr(), + /*module=*/op.getModuleAttr(), + /*Sout=*/op.getSoutAttr()); + new_op->setAttrs(op->getAttrs()); + rewriter.replaceOp(op, new_op); + return success(); + } + return failure(); + } +}; + +class PopulateShapePass + : public impl::PopulateShapePassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PopulateShapePass) + + explicit PopulateShapePass() = default; + + private: + void runOnOperation() override; +}; + +void PopulateShapePass::runOnOperation() { + Operation *op = getOperation(); + MLIRContext *context = op->getContext(); + RewritePatternSet patterns(context); + ConversionTarget target(*context); + target.addDynamicallyLegalOp([](Operation *op) { + auto custom_aggregator_op = llvm::dyn_cast(op); + return custom_aggregator_op.getInput().getType().isa() && + custom_aggregator_op.getOutput().getType().isa(); + }); + target.addDynamicallyLegalOp([](Operation *op) { + if (op->getNumResults() != 1) return true; + return op->getResultTypes()[0].isa(); + }); + + patterns + .add( + context); + + if (failed(applyPartialConversion(op, target, std::move(patterns)))) { + return signalPassFailure(); + } +} +} // namespace + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc index 6da27d9e3c2823..24d15dfd6688d5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc @@ -15,11 +15,9 @@ limitations under the License. // Copied and modified from // //third_party/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc // This transformation pass applies quantization propagation on TF dialect. -#include #include #include -#include "llvm/Support/Casting.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project @@ -35,6 +33,7 @@ limitations under the License. 
#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -134,50 +133,6 @@ class ConvertArithConstToStablehloConstOp } }; -std::unique_ptr GetStableHLOOpQuantSpec(Operation* op) { - auto spec = std::make_unique(); - if (auto call_op = dyn_cast_or_null(op)) { - auto entry_function = - call_op->getAttrOfType("_entry_function"); - StringRef function_name = entry_function.getValue(); - if (!function_name.startswith("composite_")) { - return spec; - } - if (function_name.contains("conv")) { - spec->coeff_op_quant_dim[1] = 3; - if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; - } - } else if (function_name.contains("dot_general")) { - spec->coeff_op_quant_dim[1] = -1; - if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; - } - } else if (function_name.contains("dot")) { - spec->coeff_op_quant_dim[1] = -1; - if (function_name.contains("with_bias")) { - spec->biases_params[2] = {{0, 1}, - quant::GetUniformQuantizedTypeForBias}; - } - } - for (auto quantizable_operand : spec->coeff_op_quant_dim) { - spec->quantizable_operands.insert(quantizable_operand.first); - } - } - return spec; -} - -std::unique_ptr GetStableHLOQuantScaleSpec(Operation* op) { - auto scale_spec = std::make_unique(); - if (llvm::isa( - op)) { - scale_spec->has_same_scale_requirement = true; - } - return scale_spec; -} - void PrepareQuantizePass::runOnOperation() { func::FuncOp func = getOperation(); MLIRContext* ctx = func.getContext(); @@ -185,8 +140,8 @@ void PrepareQuantizePass::runOnOperation() { // The function might contain more stats ops than required, and it will // introduce requantize if the calibration stats have conflicts. This tries to // remove all the redundant stats ops. - RemoveRedundantStatsOps(func, GetStableHLOOpQuantSpec, - GetStableHLOQuantScaleSpec); + RemoveRedundantStatsOps(func, GetStableHloOpQuantSpec, + GetStableHloQuantScaleSpec); RewritePatternSet patterns(ctx); // Convert quant stats to int8 quantization parameters. @@ -209,7 +164,7 @@ void PrepareQuantizePass::runOnOperation() { // values (tensors). ApplyQuantizationParamsPropagation( func, /*is_signed=*/true, bit_width_, !enable_per_channel_quantization_, - GetStableHLOOpQuantSpec, GetStableHLOQuantScaleSpec, + GetStableHloOpQuantSpec, GetStableHloQuantScaleSpec, /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false); // Restore constants as stablehlo::ConstantOp. diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc new file mode 100644 index 00000000000000..6cd3be0cdc572c --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc @@ -0,0 +1,432 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h" + +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +#define DEBUG_TYPE "populate-quantization-patterns" + +namespace mlir::quant::stablehlo { + +namespace { + +using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::ConvolutionOp; +using ::mlir::stablehlo::DotGeneralOp; +using ::mlir::stablehlo::DynamicBroadcastInDimOp; +using ::mlir::stablehlo::UniformQuantizeOp; + +constexpr StringRef kCompositeFuncPrefix = "composite_"; +constexpr StringRef kQuantizedFuncPrefix = "quantized_"; +constexpr StringRef kEntryFuncAttrName = "_entry_function"; + +// Returns true if `type` is a TensorType with quantized elements. +bool IsQuantizedTensorType(const Type type) { + return type.isa() && + type.cast().getElementType().isa(); +} + +// Returns true if an op has adjacent bias or activation that can be fused +// together into the quantization function. +// TODO: b/307620428 - Consider using matchAndRewrite to check and apply +// patterns at the same time. Also add check for fusible activation or +// fusible patterns with dynamic shape. +bool HasFusibleQuantizationPattern(Operation& op) { + if (isa(op.getNextNode())) { + return true; + } + return false; +} + +// Returns dynamically broadcasted user op of an input op. Returns null if +// the op is used multiple times or the user op is not dynamically broadcasted. 
+// Dynamic shapes usually has the following pattern. In the example below, +// the input operand would be stablehlo.gemm_style op, and return value would +// be stablehlo.add op. +// +// ``` +// %2 = stablehlo.gemm_style(%0, %1) +// %3 = shape.shape_of %2 +// %4 = stablehlo.dynamic_broadcast_in_dims %cst, %3 +// %5 = stablehlo.add %2, %4 +// ``` +Operation* GetDynamicallyBroadcastedUserOp(Operation& op) { + if (!op.hasOneUse()) { + LLVM_DEBUG(llvm::dbgs() + << "Target op is used multiple times and will not be checked " + "for dynamic shape case.\n"); + return nullptr; + } + Operation& shapeof_op = *op.getNextNode(); + if (!isa(shapeof_op)) { + return nullptr; + } + Operation& broadcast_in_dims_op = *shapeof_op.getNextNode(); + if (!isa(broadcast_in_dims_op)) { + return nullptr; + } + return broadcast_in_dims_op.getNextNode(); +} + +// Checks if all inputs and outputs are quantized. +bool HasQuantizedOperandOrOutput(Operation& call_op) { + SmallVector arg_types; + for (const Value arg : call_op.getOperands()) { + arg_types.push_back(arg.getType()); + } + + SmallVector output_types; + for (const Value output : call_op.getResults()) { + output_types.push_back(output.getType()); + } + + return absl::c_all_of(arg_types, IsQuantizedTensorType) && + absl::c_all_of(output_types, IsQuantizedTensorType); +} + +// Gets the corresponding quantized function name from the given function name. +// Example: "composite_dot_general_fn_1" => "quantized_dot_general_fn" +std::string GetQuantizedFunctionName(const StringRef func_name) { + return Twine(kQuantizedFuncPrefix) + .concat(func_name.rsplit(kCompositeFuncPrefix).second) + .str(); +} + +// Returns true if `xla_call_module_op` is quantized. To be considered +// quantized, it should meet three conditions: +// 1. At least one of the inputs or outputs should be a uniform quantized type. +// 2. `xla_call_module_op` should have the `kQuantTraitAttrName` attribute. +// 3. It should also have the `kEntryFuncAttrName` attribute, which points to +// the function that `xla_call_module_op` represents. +bool IsQuantizedXlaCallModuleOp(TF::XlaCallModuleOp xla_call_module_op) { + return HasQuantizedOperandOrOutput(*xla_call_module_op) && + xla_call_module_op->hasAttr(kQuantTraitAttrName) && + xla_call_module_op->hasAttr(kEntryFuncAttrName); +} + +// Returns the entry function, i.e. the callee of `xla_call_module_op`. +func::FuncOp GetEntryFuncOp(TF::XlaCallModuleOp xla_call_module_op, + SymbolTable symbol_table) { + const auto entry_function_symbol_ref = + xla_call_module_op->getAttrOfType(kEntryFuncAttrName); + + return dyn_cast_or_null( + symbol_table.lookup(entry_function_symbol_ref.getValue())); +} + +// Replaces the function type of `entry_func_op` to a quantized one, matching +// the input and output types of `xla_call_module_op`. +void SetQuantizedFunctionType(PatternRewriter& rewriter, + func::FuncOp entry_func_op, + TF::XlaCallModuleOp xla_call_module_op) { + SmallVector arg_types; + SmallVector arg_locs; + for (const Value arg : xla_call_module_op.getArgs()) { + arg_types.push_back(arg.getType()); + arg_locs.push_back(arg.getLoc()); + } + + SmallVector output_types; + for (const Value output : xla_call_module_op.getOutput()) { + output_types.push_back(output.getType()); + } + + entry_func_op.setFunctionType( + rewriter.getFunctionType(arg_types, output_types)); + + // Replace argument types and locs. 
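+    // Keep the entry block arguments in sync with the new signature;
+    // llvm::zip_equal below also asserts that the number of block arguments
+    // matches the number of operands of the XlaCallModuleOp.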
+ Block& entry = entry_func_op->getRegion(0).front(); + for (auto [arg, arg_type, arg_loc] : + llvm::zip_equal(entry.getArguments(), arg_types, arg_locs)) { + arg.setType(arg_type); + arg.setLoc(arg_loc); + } +} + +// Creates a UniformQuantize op and sets it as return op. +void CreateAndReturnUniformQuantizeOp(PatternRewriter& rewriter, Operation& op, + func::FuncOp entry_func_op, + const Type func_result_type) { + // Add i32 -> i8 requantization. + UniformQuantizeOp uniform_quant_op = rewriter.create( + op.getLoc(), func_result_type, op.getResults()); + cast(entry_func_op.getBody().front().getTerminator()) + .setOperand(0, uniform_quant_op); +} + +// An interface representing patterns that quantizes an entry function's body. +// The entry function's signatures should have already been quantized at the +// point of rewriting. +class EntryFuncBodyQuantizationPattern { + public: + virtual ~EntryFuncBodyQuantizationPattern() = default; + + // Returns `success()` if `entry_func_op`'s body is eligible for rewriting. At + // this point `entry_func_op`'s signature has not been reset with quantized + // types. + virtual LogicalResult match(func::FuncOp entry_func_op) const = 0; + + // Rewrites the `entry_func_op`'s body. + virtual void rewrite(func::FuncOp entry_func_op, + PatternRewriter& rewriter) const = 0; +}; + +// Gemm Style Op: glossary/gemm. +template +// Match for all gemm_style op and check for possible fusions. +LogicalResult MatchGemmStyleOp(func::FuncOp entry_func_op) { + // function must have input, filter, and optionally bias. + auto& operations = entry_func_op.getBody().front().getOperations(); + if (operations.size() != 2 && operations.size() != 3) { + return failure(); + } + if (!isa(operations.front())) { + return failure(); + } else if (GetDynamicallyBroadcastedUserOp(operations.front())) { + LLVM_DEBUG(llvm::dbgs() + << "Currently gemm style ops quantization only supports static " + " shapes.\n"); + return failure(); + } else if (!isa( + operations.front().getResult(0).getType())) { + return failure(); + } + return success(); +} + +// Gemm Style Op: glossary/gemm. +template +void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter) { + // Update the output type of the gemm_style op. + GemmStyleOp gemm_style_op = *entry_func_op.getOps().begin(); + + const Type input_type = entry_func_op.getArgumentTypes()[0]; + const Type filter_type = entry_func_op.getArgumentTypes()[1]; + const Type func_result_type = entry_func_op.getResultTypes()[0]; + + const double input_scale = + getElementTypeOrSelf(input_type).cast().getScale(); + const double filter_scale = + getElementTypeOrSelf(filter_type).cast().getScale(); + const double result_scale = input_scale * filter_scale; + + // Define the intermediate output type, which is an i32 quantized type. + // This is intermediate because the final output type of the entry_func_op + // should be an i8 quantized type. 
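+    // Worked example: with an input scale of 0.02 and a filter scale of 0.5,
+    // the accumulator type created below is an i32 quantized type with scale
+    // 0.02 * 0.5 = 0.01 and zero point 0; CreateAndReturnUniformQuantizeOp
+    // later requantizes it to the i8 result type of the entry function.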
+ const UniformQuantizedType gemm_style_quantized_element_type = + CreateI32F32UniformQuantizedType(gemm_style_op->getLoc(), + *rewriter.getContext(), result_scale, + /*zero_point=*/0); + + Value gemm_style_op_result = gemm_style_op->getResult(0); + auto gemm_style_op_result_type = + gemm_style_op_result.getType().cast(); + const ArrayRef gemm_style_shape = + gemm_style_op_result_type.getShape(); + + const TensorType new_gemm_style_op_result_type = + gemm_style_op_result_type.cloneWith(gemm_style_shape, + gemm_style_quantized_element_type); + gemm_style_op_result.setType(new_gemm_style_op_result_type); + + rewriter.setInsertionPointAfter(gemm_style_op); + + Operation& next_op = *gemm_style_op->getNextNode(); + // If an op is used multiple times, do not apply quantization of fused + // patterns to prevent removal of dependee ops. + const bool should_quantize_without_fusion = + HasFusibleQuantizationPattern(*gemm_style_op.getOperation()) && + !gemm_style_op->hasOneUse(); + + // TODO: b/307620428 - Add support for dynamic shapes. + if (should_quantize_without_fusion || !isa(next_op)) { + // no bias + CreateAndReturnUniformQuantizeOp(rewriter, *gemm_style_op, entry_func_op, + func_result_type); + return; + } + // bias fusion + Value bias_op = next_op.getOperand(1); + Value add_op_result = next_op.getResult(0); + const auto add_op_result_type = + add_op_result.getType().cast(); + const ArrayRef add_op_shape = add_op_result_type.getShape(); + // For quantized bias add case, lhs, rhs, and result have the same types. + const TensorType new_add_op_result_type = add_op_result_type.cloneWith( + add_op_shape, gemm_style_quantized_element_type); + add_op_result.setType(new_add_op_result_type); + + AddOp bias_add_op = + rewriter.create(gemm_style_op->getLoc(), gemm_style_op, bias_op); + + CreateAndReturnUniformQuantizeOp(rewriter, *bias_add_op, entry_func_op, + func_result_type); +} + +// Quantizes the entry function's body containing a `DotGeneralOp`. +class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeDotGeneralOpPattern() = default; + + LogicalResult match(func::FuncOp entry_func_op) const override { + return MatchGemmStyleOp(entry_func_op); + } + + void rewrite(func::FuncOp entry_func_op, + PatternRewriter& rewriter) const override { + RewriteGemmStyleOp(entry_func_op, rewriter); + } +}; + +// Quantizes the entry function's body containing a `ConvolutionOp`. +class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeConvolutionOpPattern() = default; + + LogicalResult match(func::FuncOp entry_func_op) const override { + return MatchGemmStyleOp(entry_func_op); + } + + void rewrite(func::FuncOp entry_func_op, + PatternRewriter& rewriter) const override { + RewriteGemmStyleOp(entry_func_op, rewriter); + } +}; + +// Converts `entry_func_op` to be quantized according to the respective +// inputs and outputs of `xla_call_module_op` that are possibly quantized. It +// signature (type) is reset to match that of `xla_call_module_op`. +// `entry_func_body_quantization_pattern` rewrites the function's body, based on +// the new signature. 
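+// For example, a callee named "composite_dot_general_fn_1" has its signature
+// and body rewritten with quantized types and is renamed to
+// "quantized_dot_general_fn".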
+void QuantizeEntryFuncOp( + MLIRContext& ctx, PatternRewriter& rewriter, + TF::XlaCallModuleOp xla_call_module_op, func::FuncOp entry_func_op, + const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) { + SetQuantizedFunctionType(rewriter, entry_func_op, xla_call_module_op); + + body_rewrite_pattern.rewrite(entry_func_op, rewriter); + + // Rename the function to be clear that the function has been quantized. + const std::string quantized_function_name = + GetQuantizedFunctionName(entry_func_op.getSymName()); + entry_func_op.setSymName(quantized_function_name); +} + +// Replaces a quantized `xla_call_module_op` with a `func::CallOp`. The callee +// is expected to remain unquantized (thus having a signature mismatch), and it +// is also quantized accordingly. +void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( + MLIRContext& ctx, PatternRewriter& rewriter, + TF::XlaCallModuleOp xla_call_module_op, + const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) { + ModuleOp module_op = xla_call_module_op->getParentOfType(); + SymbolTable symbol_table(module_op); + + func::FuncOp entry_func_op = GetEntryFuncOp(xla_call_module_op, symbol_table); + QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op, + body_rewrite_pattern); + + // Replace the XlaCallModuleOp with a new CallOp. + rewriter.setInsertionPoint(xla_call_module_op); + rewriter.replaceOpWithNewOp(xla_call_module_op, entry_func_op, + xla_call_module_op.getArgs()); +} + +// Pattern that mainly does two things: +// +// 1. Replaces quantized `TF::XlaCallModuleOp` with a `func::CallOp`. +// 2. Quantizes the callee function. +// +// The inputs of this pattern assumes an invalid IR, where even if a +// `TF::XlaCallModuleOp` is quantized the callee remains unquantized. Step (2) +// not only replaces the input and output tensor types into quantized ones, but +// also rewrites the body with a quantized equivalent. +// +// `FuncBodyRewritePatternT` defines how a function body is quantized and +// rewritten. +template >> +class XlaCallModuleOpToCallOp : public OpRewritePattern { + public: + explicit XlaCallModuleOpToCallOp(MLIRContext& ctx) + : OpRewritePattern(&ctx) {} + + LogicalResult match(TF::XlaCallModuleOp op) const override { + ModuleOp module_op = op->getParentOfType(); + SymbolTable symbol_table(module_op); + + // Ignore unquantized ops. + if (!IsQuantizedXlaCallModuleOp(op)) return failure(); + + func::FuncOp entry_func_op = GetEntryFuncOp(op, symbol_table); + if (!entry_func_op) { + op->emitError("Failed to find a valid entry function."); + return failure(); + } + + return FuncBodyRewritePatternT().match(entry_func_op); + } + + void rewrite(TF::XlaCallModuleOp xla_call_module_op, + PatternRewriter& rewriter) const override { + ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( + *rewriter.getContext(), rewriter, xla_call_module_op, + FuncBodyRewritePatternT()); + } +}; + +} // namespace + +// TODO: b/307620428 - Increase fused op coverage for static range quantization. 
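+// Sketch of the intended usage, mirroring QuantizePass below (assumes a
+// ModuleOp `module_op` and an MLIRContext `ctx` are in scope):
+//   RewritePatternSet patterns(&ctx);
+//   PopulateFusedGemmStylePatterns(ctx, patterns);
+//   (void)applyPatternsAndFoldGreedily(module_op, std::move(patterns));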
+void PopulateFusedGemmStylePatterns(MLIRContext& ctx, + RewritePatternSet& patterns) { + patterns.add, + XlaCallModuleOpToCallOp>(ctx); +} + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h new file mode 100644 index 00000000000000..79daa9ce8b48b8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h @@ -0,0 +1,379 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace mlir::quant::stablehlo { + +// Checks if an op is quantizable in StableHLO quantizer. Argument op is not +// necessarily a StableHLO op. +bool IsOpQuantizableStableHlo(Operation* op); + +// A base rewrite pattern which matches any N-in-M-out operations with +// quantization parameters propagated to at least one of its operands. The +// quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs. +// Each matched pattern are rewritten by its quantized alternatives. 
+// +// The concrete pattern, extends from this base pattern, can specify whether it +// allows dynamic range quantized operands and results for the operations in the +// current context. These "DynamicRangeQuantized" operands and results don't +// have quantization parameters propagated to, so will be in float in the +// quantized results. The concrete pattern should define the following two +// functions: +// +// bool AllowDynamicRangeQuantizedOperand(Operation&) const +// bool AllowDynamicRangeQuantizedResult(Operation&) const +// +// Full integer quantization disallows "DynamicRangeQuantized" operands or +// results. Dynamic range quantization allows "DynamicRangeQuantized" operands +// and results. +// +// Implementation of this pattern is mostly copied from QuantizationPattern in +// third_party/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h. +template +class StableHloQuantizationPattern : public RewritePattern { + public: + using BaseType = + StableHloQuantizationPattern; + + explicit StableHloQuantizationPattern( + MLIRContext* context, const mlir::quant::QuantPassSpec& quant_params) + // Set the score to a large number so it is always preferred. + : RewritePattern(RootOpT::getOperationName(), 300, context), + quant_params_(quant_params) {} + + private: + LogicalResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + llvm::SmallVector quantizing_ops; + + // Collect all the ops to quantize, as the user / producer of the root op. + if constexpr (std::is_same_v) { + if (op->getNumResults() != 1) { + op->emitError("Dequantize op should have exactly one result."); + return failure(); + } + auto users = op->getResult(0).getUsers(); + quantizing_ops.append(users.begin(), users.end()); + } else if constexpr (std::is_same_v) { + if (op->getNumOperands() != 1) { + op->emitError("Quantize op should have exactly one operand."); + return failure(); + } + Value quantize_operand = op->getOperand(0); + if (QuantizedType::getQuantizedElementType(quantize_operand.getType())) { + // The input of the quantize op has already been quantized, i.e. + // rescale. + return failure(); + } + DenseFPElementsAttr attr; + if (matchPattern(quantize_operand, m_Constant(&attr))) { + // Const-> QuantizeOp pattern will be handled separately. + return failure(); + } + if (Operation* quantizing_op = quantize_operand.getDefiningOp()) { + quantizing_ops.push_back(quantizing_op); + } + } + + absl::flat_hash_set ops_blocklist = + quant_params_.quant_spec.ops_blocklist; + absl::flat_hash_set nodes_blocklist = + quant_params_.quant_spec.nodes_blocklist; + CustomMap custom_map = quant_params_.quant_spec.custom_map; + + // Rewrite the floating-point ops to the quantized version, by fusing + // preceding dequantize ops and succeding quantize ops. + for (Operation* quantizing_op : quantizing_ops) { + // If it is requantize op, we shouldn't rewrite this op. + if (llvm::isa(quantizing_op)) { + return failure(); + } + + // If the op is terminator, we shouldn't rewrite. + if (quantizing_op->hasTrait()) { + return failure(); + } + + if (!IsOpQuantizableStableHlo(quantizing_op) && + !static_cast(this)->IsQuantizableCustomOp( + *quantizing_op, custom_map)) { + return failure(); + } + + if (GetStableHloQuantScaleSpec(quantizing_op) + ->has_same_scale_requirement && + !IsConnectedWithQuantizedCompsiteFunction(quantizing_op)) { + return failure(); + } + + // Blocklist op is checked in advance for non-dynamic range quantization + // case. 
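+      // For instance, an ops_blocklist entry "stablehlo.dot_general" keeps
+      // every matched dot_general op in float precision.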
+ if (!quant_params_.quant_spec.weight_quantization && + (ops_blocklist.contains( + quantizing_op->getName().getStringRef().str()))) { + return failure(); + } + + if (!nodes_blocklist.empty()) { + if (auto name_loc = quantizing_op->getLoc().dyn_cast()) { + std::string sloc = name_loc.getName().str(); + if (!sloc.empty() && + (nodes_blocklist.find(sloc) != nodes_blocklist.end())) { + return failure(); + } + } + } + + // Collect all the quantized inputs and "clone" the matched op by these + // inputs. + SmallVector inputs; + inputs.reserve(quantizing_op->getNumOperands()); + for (auto operand : quantizing_op->getOperands()) { + Type operand_type = operand.getType(); + if (operand_type.isa()) { + inputs.push_back(operand); + continue; + } + + auto ele_type = operand.getType().cast().getElementType(); + if (auto dq_op = + dyn_cast_or_null(operand.getDefiningOp())) { + inputs.push_back(dq_op.getOperand()); + } else if (!ele_type.isF32()) { + // If the operand is an integer tensor, then it doesn't require the + // DequantizeOp in the pattern. + inputs.push_back(operand); + } else { + return failure(); + } + } + + // Collect all the quantized outputs and replace them by the results of + // the new quantized op. + llvm::SmallDenseMap outputs_replaced; + SmallVector output_types; + output_types.reserve(quantizing_op->getNumResults()); + for (const auto& enumerated_result : + llvm::enumerate(quantizing_op->getResults())) { + Value result = enumerated_result.value(); + Type result_type = result.getType(); + // Add this to the test coverage once we create test ops with none type + // results. + if (result_type.isa()) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result_type); + continue; + } + Type result_ele_type = + result.getType().cast().getElementType(); + // If the user is the QuantizeOp, it must be the only user. + if (result.hasOneUse() && + llvm::isa(*result.user_begin())) { + auto user = llvm::cast(*result.user_begin()); + outputs_replaced.insert( + {user.getResult(), enumerated_result.index()}); + output_types.push_back(user.getType()); + } else if (!result_ele_type.isF32()) { + // If the result is an integer tensor, then it doesn't require the + // D op in the pattern. + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else if (static_cast(this) + ->AllowDynamicRangeQuantizedResult(*quantizing_op, + custom_map)) { + outputs_replaced.insert({result, enumerated_result.index()}); + output_types.push_back(result.getType()); + } else { + return failure(); + } + } + + rewriter.setInsertionPointAfter(quantizing_op); + OperationState new_state(quantizing_op->getLoc(), + quantizing_op->getName().getStringRef(), inputs, + output_types, quantizing_op->getAttrs()); + for (int i = 0; i < quantizing_op->getNumRegions(); ++i) { + new_state.addRegion(); + } + Operation* quantized_op = rewriter.create(new_state); + if (quantizing_op->getNumRegions() != 0) { + for (const auto& indexed_regions : + llvm::enumerate(quantizing_op->getRegions())) { + Region& target_region = + quantized_op->getRegion(indexed_regions.index()); + IRMapping mapping; + indexed_regions.value().cloneInto(&target_region, mapping); + } + } + for (auto output : outputs_replaced) { + output.getFirst().replaceAllUsesWith( + quantized_op->getResult(output.getSecond())); + } + } + return success(); + } + + // Checks whether the operation is connnected with a quantized composite + // function. 
If not, the same-scale op will not be quantized. This decision is + // based on the current assumption that the performance gain of the same-scale + // op itself could not beat the overhead of the quantize and dequantize + // routines that need to be added around that op. When the assumption changes, + // this policy might change as well. + bool IsConnectedWithQuantizedCompsiteFunction( + Operation* same_scale_op) const { + for (const auto& operand : same_scale_op->getOperands()) { + auto dq_op = dyn_cast_or_null( + operand.getDefiningOp()); + if (!dq_op) continue; + + Operation* preceding_op = dq_op.getArg().getDefiningOp(); + if (!preceding_op) continue; + + // Check whether the preceding op is a quantized composite function. + if (llvm::isa(preceding_op)) { + auto call_op = llvm::cast(preceding_op); + if (!IsQuantizedCompositeFunction(call_op)) continue; + return true; + } + + // Check whether the preceding op is a quantized same-scale op. + if (GetStableHloQuantScaleSpec(preceding_op) + ->has_same_scale_requirement) { + for (auto result : preceding_op->getResults()) { + auto element_type = getElementTypeOrSelf(result.getType()); + if (element_type.isa()) { + return true; + } + } + } + } + + for (const auto& result : same_scale_op->getResults()) { + // If the user is the Quantize op, it must be the only user. + if (!result.hasOneUse() || + !llvm::isa(*result.user_begin())) { + continue; + } + + auto q_op = llvm::cast(*result.user_begin()); + for (auto following_op : q_op->getUsers()) { + // Check whether the following op is a quantized composite function. + if (llvm::isa(following_op)) { + auto call_op = llvm::cast(following_op); + if (!IsQuantizedCompositeFunction(call_op)) continue; + return true; + } + + // Check whether the following op is a quantized same-scale op. + if (GetStableHloQuantScaleSpec(following_op) + ->has_same_scale_requirement) { + for (auto operand : following_op->getOperands()) { + auto element_type = getElementTypeOrSelf(operand.getType()); + if (element_type.isa()) { + return true; + } + } + } + } + } + + return false; + } + + // Checks whether the op calls a composite function and all of its inputs and + // outputs are quantized. + bool IsQuantizedCompositeFunction(TF::XlaCallModuleOp call_op) const { + if (!call_op->hasAttr(kQuantTraitAttrName)) { + return false; + } + + const auto function_name = call_op->getAttrOfType( + TF::kStablehloEntryFunctionAttrName); + if (!function_name || !function_name.getValue().startswith("composite_")) { + return false; + } + + bool has_quantized_types = false; + for (Value input : call_op.getArgs()) { + if (auto type = input.getType().dyn_cast()) { + if (type.getElementType().isa()) { + return false; + } + if (type.getElementType().isa()) { + has_quantized_types = true; + } + } + } + for (Value output : call_op.getOutput()) { + if (auto type = output.getType().dyn_cast()) { + if (type.getElementType().isa()) { + return false; + } + if (type.getElementType().isa()) { + has_quantized_types = true; + } + } + } + return has_quantized_types; + } + + QuantPassSpec quant_params_; +}; + +// Gemm Style Op: glossary/gemm. +// Populates rewrite patterns that quantize fused gemm-style patterns +// (dot_general and convolution wrapped in quantized XlaCallModuleOps).
+void PopulateFusedGemmStylePatterns(MLIRContext& ctx, + RewritePatternSet& patterns); + +} // namespace mlir::quant::stablehlo + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc index 16e7ad1cfd7010..d629b26b4f0fc8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc @@ -20,10 +20,13 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project @@ -31,6 +34,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h" namespace mlir::quant::stablehlo { @@ -42,31 +46,32 @@ namespace { // Base struct for quantization. 
template struct StableHloQuantizationBase - : public QuantizationPattern { + : public StableHloQuantizationPattern { explicit StableHloQuantizationBase(MLIRContext* ctx, const QuantPassSpec& quant_params) - : QuantizationPattern(ctx, quant_params) {} + : StableHloQuantizationPattern( + ctx, quant_params) {} - static bool IsQuantizableCustomOp(Operation* op, + static bool IsQuantizableCustomOp(Operation& op, const CustomMap& custom_op_map) { return false; } static bool AllowDynamicRangeQuantizedOperand( - Operation* quantized_op, const CustomMap& custom_op_map) { + Operation& quantized_op, const CustomMap& custom_op_map) { return false; } - static bool AllowDynamicRangeQuantizedResult(Operation* quantized_op, + static bool AllowDynamicRangeQuantizedResult(Operation& quantized_op, const CustomMap& custom_op_map) { return false; } - static bool IsWeightOnlyOp(Operation* quantized_op, + static bool IsWeightOnlyOp(Operation& quantized_op, absl::flat_hash_set& ops_blocklist, bool weight_only_quantization, const CustomMap& custom_op_map) { @@ -112,7 +117,7 @@ class QuantizePass : public impl::QuantizePassBase { }; void QuantizePass::runOnOperation() { - func::FuncOp func = getOperation(); + ModuleOp module_op = getOperation(); MLIRContext& ctx = getContext(); NumericVerifySpec numeric_verify_spec; @@ -125,20 +130,21 @@ void QuantizePass::runOnOperation() { RewritePatternSet patterns(&ctx); patterns.add( &ctx, quant_params); + PopulateFusedGemmStylePatterns(ctx, patterns); - if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) { + if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { // There are cases where no rewrites happen even if a pattern matches, // causing this to result in a convergence failure. Consider this as a // best-effort. // TODO: b/305469508 - Make QuantizationPattern converge if there are no // patterns that are rewritable. - func.emitWarning("Failed to converge pattern at QuantizePass."); + module_op.emitWarning("Failed to converge pattern at QuantizePass."); } } } // namespace -std::unique_ptr> CreateQuantizePass( +std::unique_ptr> CreateQuantizePass( const QuantizationSpecs& quantization_specs) { return std::make_unique(quantization_specs); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc index cf0c44f779a9ae..01230798bdcf8c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc @@ -20,9 +20,11 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/status/status.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project @@ -50,6 +52,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#define DEBUG_TYPE "quantize-composite-functions" + namespace mlir::quant::stablehlo { #define GEN_PASS_DEF_QUANTIZECOMPOSITEFUNCTIONSPASS @@ -58,7 +62,10 @@ namespace mlir::quant::stablehlo { namespace { using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; +using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::ConvolutionOp; using ::mlir::stablehlo::DotGeneralOp; +using ::mlir::stablehlo::DynamicBroadcastInDimOp; using ::mlir::stablehlo::UniformQuantizeOp; using ::tensorflow::quantization::RunPassesOnModuleOp; @@ -79,248 +86,6 @@ class QuantizeCompositeFunctionsPass void runOnOperation() override; }; -// Returns true if `type` is a TensorType with quantized elements. -bool IsQuantizedTensorType(const Type type) { - return type.isa() && - type.cast().getElementType().isa(); -} - -// Checks if all inputs and outputs are quantized. -bool HasQuantizedOperandOrOutput(Operation* call_op) { - SmallVector arg_types; - for (const Value arg : call_op->getOperands()) { - arg_types.push_back(arg.getType()); - } - - SmallVector output_types; - for (const Value output : call_op->getResults()) { - output_types.push_back(output.getType()); - } - - return absl::c_all_of(arg_types, IsQuantizedTensorType) && - absl::c_all_of(output_types, IsQuantizedTensorType); -} - -// Get the corresponding quantized function name from the given function name. -// Example: "composite_dot_general_fn_1" => "quantized_dot_general_fn" -std::string GetQuantizedFunctionName(const StringRef func_name) { - return Twine(kQuantizedFuncPrefix) - .concat(func_name.rsplit(kCompositeFuncPrefix).second) - .str(); -} - -// Returns true if `xla_call_module_op` is quantized. To be considered -// quantized, it should meet three conditions: -// 1. At least one of the inputs or outputs should be a uniform quantized type. -// 2. `xla_call_module_op` should have the `kQuantTraitAttrName` attribute. -// 3. It should also have the `kEntryFuncAttrName` attribute, which points to -// the function that `xla_call_module_op` represents. -bool IsQuantizedXlaCallModuleOp(TF::XlaCallModuleOp xla_call_module_op) { - return HasQuantizedOperandOrOutput(xla_call_module_op) && - xla_call_module_op->hasAttr(kQuantTraitAttrName) && - xla_call_module_op->hasAttr(kEntryFuncAttrName); -} - -// Returns the entry function, i.e. the callee of `xla_call_module_op`. -func::FuncOp GetEntryFuncOp(TF::XlaCallModuleOp xla_call_module_op, - SymbolTable symbol_table) { - auto entry_function_symbol_ref = - xla_call_module_op->getAttrOfType(kEntryFuncAttrName); - - // Don't match if there are no DotGeneralOp. - // if (target_func_op.getOps().empty()) return {}; - return dyn_cast_or_null( - symbol_table.lookup(entry_function_symbol_ref.getValue())); -} - -// Replaces the function type of `entry_func_op` to a quantized one, matching -// the input and output types of `xla_call_module_op`. 
-void SetQuantizedFunctionType(PatternRewriter& rewriter, - func::FuncOp entry_func_op, - TF::XlaCallModuleOp xla_call_module_op) { - SmallVector arg_types; - SmallVector arg_locs; - for (const Value arg : xla_call_module_op.getArgs()) { - arg_types.push_back(arg.getType()); - arg_locs.push_back(arg.getLoc()); - } - - SmallVector output_types; - for (const Value output : xla_call_module_op.getOutput()) { - output_types.push_back(output.getType()); - } - - entry_func_op.setFunctionType( - rewriter.getFunctionType(arg_types, output_types)); - - // Replace argument types and locs. - Block& entry = entry_func_op->getRegion(0).front(); - for (auto [arg, arg_type, arg_loc] : - llvm::zip_equal(entry.getArguments(), arg_types, arg_locs)) { - arg.setType(arg_type); - arg.setLoc(arg_loc); - } -} - -// An interface representing patterns that quantizes an entry function's body. -// The entry function's signatures should have already been quantized at the -// point of rewriting. -class EntryFuncBodyQuantizationPattern { - public: - virtual ~EntryFuncBodyQuantizationPattern() = default; - - // Returns `success()` if `entry_func_op`'s body is eligible for rewriting. At - // this point `entry_func_op`'s signature has not been reset with quantized - // types. - virtual LogicalResult match(func::FuncOp entry_func_op) const = 0; - - // Rewrites the `entry_func_op`'s body. - virtual void rewrite(func::FuncOp entry_func_op, - PatternRewriter& rewriter) const = 0; -}; - -// Quantizes the entry function's body containing a `DotGeneralOp`. -class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern { - public: - explicit QuantizeDotGeneralOpPattern(MLIRContext& ctx) : ctx_(&ctx) {} - - LogicalResult match(func::FuncOp entry_func_op) const override { - auto& operations = entry_func_op.getBody().front().getOperations(); - return success(operations.size() == 2 && - isa(operations.front())); - } - - void rewrite(func::FuncOp entry_func_op, - PatternRewriter& rewriter) const override { - // Update the output type of the dot_general op. - auto dot_general_op = *entry_func_op.getOps().begin(); - - const Type input_type = entry_func_op.getArgumentTypes()[0]; - const Type rhs_type = entry_func_op.getArgumentTypes()[1]; - const Type func_result_type = entry_func_op.getResultTypes()[0]; - - const double input_scale = getElementTypeOrSelf(input_type) - .cast() - .getScale(); - const double rhs_scale = - getElementTypeOrSelf(rhs_type).cast().getScale(); - - // Define the intermediate output type, which is an i32 quantized type. - // This is intermediate because the final output type of the entry_func_op - // should be an i8 quantized type. - const UniformQuantizedType output_quantized_element_type = - CreateI32F32UniformQuantizedType(dot_general_op->getLoc(), *ctx_, - input_scale * rhs_scale, - /*zero_point=*/0); - - Value dot_general_op_result = dot_general_op->getResult(0); - const auto dot_general_op_result_type = - dot_general_op_result.getType().cast(); - const ArrayRef shape = dot_general_op_result_type.getShape(); - - const TensorType new_dot_general_op_result_type = - dot_general_op_result_type.cloneWith(shape, - output_quantized_element_type); - dot_general_op_result.setType(new_dot_general_op_result_type); - - // Add i32 -> i8 requantization. 
- rewriter.setInsertionPointAfter(dot_general_op); - auto uniform_quant_op = rewriter.create( - dot_general_op->getLoc(), func_result_type, - dot_general_op->getResults()); - - auto return_op = - cast(entry_func_op.getBody().front().getTerminator()); - return_op.setOperand(0, uniform_quant_op); - } - - private: - MLIRContext* ctx_ = nullptr; -}; - -// Converts `entry_func_op` to be quantized according to the respective -// inputs and outputs of `xla_call_module_op` that are possibly quantized. It -// signature (type) is reset to match that of `xla_call_module_op`. -// `entry_func_body_quantization_pattern` rewrites the function's body, based on -// the new signature. -void QuantizeEntryFuncOp( - MLIRContext& ctx, PatternRewriter& rewriter, - TF::XlaCallModuleOp xla_call_module_op, func::FuncOp entry_func_op, - const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) { - SetQuantizedFunctionType(rewriter, entry_func_op, xla_call_module_op); - - body_rewrite_pattern.rewrite(entry_func_op, rewriter); - - // Rename the function to be clear that the function has been quantized. - const std::string quantized_function_name = - GetQuantizedFunctionName(entry_func_op.getSymName()); - entry_func_op.setSymName(quantized_function_name); -} - -// Replaces a quantized `xla_call_module_op` with a `func::CallOp`. The callee -// is expected to remain unquantized (thus having a signature mismatch), and it -// is also quantized accordingly. -void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( - MLIRContext& ctx, PatternRewriter& rewriter, - TF::XlaCallModuleOp xla_call_module_op, - const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) { - auto module_op = xla_call_module_op->getParentOfType(); - SymbolTable symbol_table(module_op); - - func::FuncOp entry_func_op = GetEntryFuncOp(xla_call_module_op, symbol_table); - QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op, - body_rewrite_pattern); - - // Replace the XlaCallModuleOp with a new CallOp. - rewriter.setInsertionPoint(xla_call_module_op); - rewriter.replaceOpWithNewOp(xla_call_module_op, entry_func_op, - xla_call_module_op.getArgs()); -} - -// Pattern that mainly does two things: -// -// 1. Replaces quantized `TF::XlaCallModuleOp` with a `func::CallOp`. -// 2. Quantizes the callee function. -// -// The inputs of this pattern assumes an invalid IR, where even if a -// `TF::XlaCallModuleOp` is quantized the callee remains unquantized. Step (2) -// not only replaces the input and output tensor types into quantized ones, but -// also rewrites the body with a quantized equivalent. -// -// `FuncBodyRewritePatternT` defines how a function body is quantized and -// rewritten. -template >> -class XlaCallModuleOpToCallOp : public OpRewritePattern { - public: - explicit XlaCallModuleOpToCallOp(MLIRContext& ctx) - : OpRewritePattern(&ctx) {} - - LogicalResult match(TF::XlaCallModuleOp op) const override { - auto module_op = op->getParentOfType(); - SymbolTable symbol_table(module_op); - - // Ignore unquantized ops. 
- if (!IsQuantizedXlaCallModuleOp(op)) return failure(); - - func::FuncOp entry_func_op = GetEntryFuncOp(op, symbol_table); - if (!entry_func_op) { - op->emitError("Failed to find a valid entry function."); - return failure(); - } - - return FuncBodyRewritePatternT(*getContext()).match(entry_func_op); - } - - void rewrite(TF::XlaCallModuleOp xla_call_module_op, - PatternRewriter& rewriter) const override { - ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( - *rewriter.getContext(), rewriter, xla_call_module_op, - FuncBodyRewritePatternT(*getContext())); - } -}; - void QuantizeCompositeFunctionsPass::runOnOperation() { MLIRContext& ctx = getContext(); @@ -334,7 +99,9 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { pm.enableVerifier(false); pm.addNestedPass(CreatePrepareQuantizePass()); - pm.addNestedPass(CreateQuantizePass(quant_specs)); + // QuantizePass modifies FuncOps referenced outside of its given scope + // and therefore requires a module-level context. + pm.addPass(CreateQuantizePass(quant_specs)); pm.addNestedPass(createPostQuantizePass()); ModuleOp module_op = getOperation(); @@ -343,14 +110,6 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { !pm_run_status.ok()) { signalPassFailure(); } - - // TODO - b/307839649: Move this as a separate pass. - RewritePatternSet patterns(&ctx); - patterns.add>(ctx); - - if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { - signalPassFailure(); - } } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc index 5bf8ba7ec07657..c870e7be4087a7 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc @@ -12,14 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project @@ -28,6 +31,8 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" @@ -41,6 +46,13 @@ namespace mlir::quant::stablehlo { namespace { constexpr StringRef kQuantizeTargetOpAttr = "tf_quant.composite_function"; +constexpr StringRef kStablehloModuleAttrsAttrName = "_stablehlo_module_attrs"; +constexpr StringRef kUsesShapePolymorphismAttr = "jax.uses_shape_polymorphism"; + +// Default version number for native serialization. +constexpr int64_t kDefaultVersion = 9; +// Default platform for XlaCallModuleOp. +constexpr StringRef kPlatformCpu = "CPU"; class ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass : public impl:: @@ -125,7 +137,7 @@ class LiveOuts { // Delete the current op from liveouts and moves on to the parent ops. void update(Operation& op) { for (Value result_value : op.getResults()) { - liveouts_.erase(result_value); + liveouts_.remove(result_value); } for (Value operand : op.getOperands()) { liveouts_.insert(operand); @@ -136,19 +148,20 @@ class LiveOuts { void snapshot_previous_state() { prev_liveouts_ = liveouts_; } // Return the current live values. - const DenseSet& get() const { return liveouts_; } + const SetVector& get() const { return liveouts_; } // Return the previous live values. - const DenseSet& get_previous() const { return prev_liveouts_; } + const SetVector& get_previous() const { return prev_liveouts_; } private: - DenseSet liveouts_; - DenseSet prev_liveouts_; + // Use SerVector to ensure deterministic traversal order. + SetVector liveouts_; + SetVector prev_liveouts_; }; // Creates the tf.XlaCallModuleOp from attributes. -void CreateXlaCallModuleOp(ArrayRef inputs, ArrayRef outputs, - ArrayRef result_types, +void CreateXlaCallModuleOp(ValueRange inputs, ValueRange outputs, + TypeRange result_types, ArrayRef reverse_subgraph, func::FuncOp stablehlo_func_op, ModuleOp module_op) { MLIRContext* ctx = module_op.getContext(); @@ -163,19 +176,26 @@ void CreateXlaCallModuleOp(ArrayRef inputs, ArrayRef outputs, tf_type::ShapeAttr::get(ctx, result_type.cast())); } auto empty_array_attr = ArrayAttr::get(ctx, {}); - // TODO - b/303363466: Allow XlaCallModuleOp with versions >5. + // TODO - b/310291615: Support platforms = ["TPU"]. + auto platforms = ArrayAttr::get(ctx, {StringAttr::get(ctx, kPlatformCpu)}); + auto xla_call_module_op = builder.create( module_op.getLoc(), /*output=*/result_types, /*args=*/inputs, - /*version=*/5, /*module=*/"", + /*version=*/kDefaultVersion, /*module=*/"", /*Sout=*/ArrayAttr::get(ctx, shape_attrs), - /*dim_args_spec=*/empty_array_attr, - /*platforms=*/empty_array_attr, + /*dim_args_spec=*/empty_array_attr, platforms, /*function_list=*/empty_array_attr, /*has_token_input_output=*/false, /*disabled_checks=*/empty_array_attr); xla_call_module_op->setAttr(TF::kStablehloEntryFunctionAttrName, SymbolRefAttr::get(stablehlo_func_op)); + // Set jax.uses_shape_polymorphism=true to enable shape refinement at runtime. + // This is needed for native serialization version >= 8. 
+  xla_call_module_op->setAttr(
+      kStablehloModuleAttrsAttrName,
+      builder.getDictionaryAttr(builder.getNamedAttr(
+          kUsesShapePolymorphismAttr, builder.getBoolAttr(true))));
  for (auto [original_output_value, xla_call_module_op_result_value] :
       llvm::zip_equal(outputs, xla_call_module_op->getResults())) {
@@ -251,18 +271,18 @@ void ReplaceStablehloOpsWithXlaCallModuleOp(
// Contains the actual logic for updating states and replacing StableHLO ops
// with tf.XlaCallModuleOps.
void UpdateStatesAndReplaceStablehloOps(
-    const DenseSet& operands, const DenseSet& defined_values,
+    const SetVector& operands, const SetVector& defined_values,
    const LiveOuts& liveouts, ModuleOp module_op,
    ArrayRef reverse_subgraph, const int stablehlo_func_id,
    func::FuncOp main_func, const bool is_last_subgraph = false) {
-  DenseSet inputs = operands;
+  SetVector inputs = operands;
  for (Value defined_value : defined_values) {
-    inputs.erase(defined_value);
+    inputs.remove(defined_value);
  }
-  DenseSet outputs = liveouts.get_previous();
+  SetVector outputs = liveouts.get_previous();
  for (Value live_value : liveouts.get()) {
-    outputs.erase(live_value);
+    outputs.remove(live_value);
  }
  if (is_last_subgraph) {
@@ -270,7 +290,7 @@ void UpdateStatesAndReplaceStablehloOps(
    // throughout (functions as an invisible op above the very first op that
    // returns the arguments).
    for (const BlockArgument arg : main_func.getArguments()) {
-      outputs.erase(arg);
+      outputs.remove(arg);
    }
  }
@@ -298,20 +318,65 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps(
  // statement is not included in any subgraph (e.g. XlaCallModuleOp) and is
  // untouched.
  SmallVector reverse_main_func_block_ops;
+  SetVector ops_to_add;
  for (Operation& main_func_block_op :
       llvm::reverse(main_func_block.without_terminator())) {
    reverse_main_func_block_ops.push_back(&main_func_block_op);
+    ops_to_add.insert(&main_func_block_op);
  }
  // Create a separate subgraph invoked with XlaCallModuleOp per each
  // set of StableHLO ops in the main func block.
  SmallVector reverse_subgraph;
-  DenseSet operands;
-  DenseSet defined_values;
+  SetVector operands;
+  SetVector defined_values;
+
+  // Adds `op` to the current subgraph.
+  auto add_to_subgraph = [&](Operation* op) {
+    // Move on to the parent ops.
+    liveouts.update(*op);
+    ops_to_add.remove(op);
+
+    if (!IsStablehloOp(op)) {
+      // Always update the liveouts when the subgraph isn't being continued.
+      liveouts.snapshot_previous_state();
+      return;
+    }
+
+    reverse_subgraph.push_back(op);
+    defined_values.insert(op->getResults().begin(), op->getResults().end());
+    operands.insert(op->getOperands().begin(), op->getOperands().end());
+  };
  int stablehlo_func_id = -1;
  for (Operation* op : reverse_main_func_block_ops) {
+    if (!ops_to_add.contains(op)) continue;
+    // When hitting a non-StableHLO op, e.g. tf.CustomAggregatorOp, start
+    // recursively tracing the defining ops of the current subgraph's operands.
+    // This makes sure that all dependencies needed for shape inference are
+    // included in the subgraph. Tracing stops when hitting a non-StableHLO op
+    // or an op with multiple uses; in the latter case we have to stop because
+    // otherwise the other users of that op would become dangling references.
+    // TODO: b/311239049 - Consider rewriting this using BFS.
    if (!IsStablehloOp(op)) {
+      bool should_add_op = true;
+      while (should_add_op) {
+        should_add_op = false;
+        Operation* defining_op = nullptr;
+        for (Value v : operands) {
+          if (defined_values.contains(v)) continue;
+          // Check if the op has a branch and skip it if so.
+          if (v.getDefiningOp() && IsStablehloOp(v.getDefiningOp()) &&
+              v.getDefiningOp()->hasOneUse()) {
+            defining_op = v.getDefiningOp();
+            should_add_op = true;
+            break;
+          }
+        }
+        if (should_add_op) {
+          add_to_subgraph(defining_op);
+        }
+      }
      // Create an XlaCallModuleOp if reverse_subgraph isn't empty.
      if (!reverse_subgraph.empty()) {
        UpdateStatesAndReplaceStablehloOps(operands, defined_values, liveouts,
@@ -324,20 +389,7 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps(
        defined_values.clear();
      }
    }
-
-    // Move on to the parent ops.
-    liveouts.update(*op);
-
-    if (!IsStablehloOp(op)) {
-      // Always update the liveouts when the subgraph isn't being continued.
-      liveouts.snapshot_previous_state();
-      continue;
-    }
-
-    reverse_subgraph.push_back(op);
-
-    defined_values.insert(op->getResults().begin(), op->getResults().end());
-    operands.insert(op->getOperands().begin(), op->getOperands().end());
+    add_to_subgraph(op);
  }
  // Create the last subgraph if it isn't empty.
@@ -348,6 +400,37 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps(
  }
}
+// Duplicates small constants for each use.
+//
+// In the subsequent graph partitioning, constants for shape inference need to
+// be in the same subgraph. But graph partitioning stops at ops with multiple
+// uses. So here we duplicate small constants for each use so that if a
+// constant is useful for shape inference in multiple subgraphs, it can be
+// included in each subgraph. If duplicate constants are accidentally created
+// in the same subgraph, they can be easily removed with a canonicalizer pass.
+//
+// We set a size limit since constants needed for shape inference are no
+// larger than the tensor rank. This avoids duplicating large constants.
+void DuplicateSmallConstantOps(ModuleOp module_op, func::FuncOp main_func) {
+  OpBuilder builder(main_func.getContext());
+  for (auto constant_op :
+       main_func.getBody().getOps()) {
+    builder.setInsertionPointAfter(constant_op);
+    if (constant_op.getResult().use_empty() ||
+        constant_op.getResult().hasOneUse())
+      continue;
+    // Do not duplicate the constant op if its size is too large.
+    // 32 is chosen to be larger than all constants useful for shape inference,
+    // while small enough not to significantly increase the model size.
+    if (constant_op.getValue().getNumElements() > 32) continue;
+    while (!constant_op.getResult().hasOneUse()) {
+      auto new_constant_op = builder.clone(*constant_op.getOperation());
+      constant_op.getResult().getUses().begin()->assign(
+          dyn_cast(new_constant_op));
+    }
+  }
+}
+
void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass::
    runOnOperation() {
  ModuleOp module_op = getOperation();
@@ -355,14 +438,15 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass::
  func::FuncOp main_func = GetMainFunc(module_op);
  if (!main_func) return;
+  DuplicateSmallConstantOps(module_op, main_func);
  ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOps(module_op, main_func);
  // TODO - b/298966126: Currently quantizable functions are identified in TF
-  // Quantizer via the tf_quant.composite_function UnitAttr attached to func
-  // ops. We remove this attribute as this interferes with VHLO conversion.
+  // Quantizer via the tf_quant.composite_function UnitAttr attached to
+  // func ops. We remove this attribute as this interferes with VHLO conversion.
  // Remove this temporary hack.
for (auto func_op : module_op.getOps()) { - func_op->removeAttr(kQuantizeTargetOpAttr); + func_op->removeAttr(kFusedFunctionAttr); } } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/restore_function_name.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/restore_function_name.cc index 545d36b625b532..57b6a2a07a04d1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/restore_function_name.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/restore_function_name.cc @@ -24,7 +24,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" @@ -54,12 +54,12 @@ class RestoreFunctionNamePass void RestoreFunctionNameFromXlaCallModuleOp(TF::XlaCallModuleOp& call_op, SymbolTable& symbol_table) { - if (!call_op->hasAttr(mlir::quant::kOriginalStablehloEntryFunctionAttrName)) { + if (!call_op->hasAttr(kOriginalStablehloEntryFunctionAttrName)) { return; } auto original_function_name = call_op->getAttrOfType( - mlir::quant::kOriginalStablehloEntryFunctionAttrName); + kOriginalStablehloEntryFunctionAttrName); auto current_function_name = call_op->getAttrOfType( TF::kStablehloEntryFunctionAttrName); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc new file mode 100644 index 00000000000000..a65694a7a7287f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc @@ -0,0 +1,122 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" + +namespace mlir::quant::stablehlo { + +#define GEN_PASS_DEF_UNWRAPXLACALLMODULEOPPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" + +namespace { + +// Unwraps XlaCallModule ops without quantizable trait that call function with +// '_from_xla_call_module' trait. +class UnwrapXlaCallModuleOpPass + : public impl::UnwrapXlaCallModuleOpPassBase { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(UnwrapXlaCallModuleOpPass) + + explicit UnwrapXlaCallModuleOpPass() = default; + + private: + void runOnOperation() override; +}; + +void UnwrapXlaCallModuleOp(TF::XlaCallModuleOp call_op, + SymbolTable& symbol_table) { + // Do not inline lifted quantized functions used for fusing patterns. + // TODO - b/310539922: Remove reference to TF/TFL utils. + if (call_op->hasAttr(kQuantTraitAttrName)) { + return; + } + + auto function_name = call_op + ->getAttrOfType( + TF::kStablehloEntryFunctionAttrName) + .getValue(); + func::FuncOp func_op = symbol_table.lookup(function_name); + + // We should not unwrap if the function is not from + // ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass. 
+ if (!func_op->hasAttr(TF::kFromXlaCallModuleAttrName)) { + return; + } + + MLIRContext* context = call_op.getContext(); + OpBuilder builder(context); + builder.setInsertionPointAfter(call_op); + + IRMapping arg_mapper; + for (auto [func_arg, operand] : + llvm::zip_equal(func_op.getArguments(), call_op.getOperands())) { + arg_mapper.map(func_arg, operand); + } + + Region& function_body = func_op.getBody(); + IRMapping new_op_mapper; + for (Operation& op : function_body.getOps()) { + if (llvm::isa(op)) { + for (auto [call_result, return_value] : + llvm::zip_equal(call_op.getResults(), op.getOperands())) { + Value new_result = new_op_mapper.lookup(return_value); + + call_result.replaceAllUsesWith(new_result); + } + continue; + } + + Operation& new_op = *builder.clone(op, arg_mapper); + for (auto [result, new_result] : + llvm::zip_equal(op.getResults(), new_op.getResults())) { + new_op_mapper.map(result, new_result); + } + } + + call_op.erase(); +} + +void UnwrapXlaCallModuleOpPass::runOnOperation() { + ModuleOp module_op = getOperation(); + SymbolTable symbol_table(module_op); + + for (auto func_op : module_op.getOps()) { + Region& function_body = func_op.getBody(); + + function_body.walk([&](TF::XlaCallModuleOp call_op) { + UnwrapXlaCallModuleOp(call_op, symbol_table); + }); + } +} + +} // namespace + +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td deleted file mode 100644 index 744637d58d8760..00000000000000 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/utils.td +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright 2023 The StableHLO Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===----------------------------------------------------------------------===// -// Helper functions. -//===----------------------------------------------------------------------===// - -// Checks whether the value of a constant equals the given float, regardless -// of the tensor dimension. -class FloatValueEquals : Constraint>; - -// Fetches the default or null attribute, used for pattern matching. -def DefaultOrNullAttr : NativeCodeCall<"DefaultOrNullAttr($_builder, $0)">; - -// Returns true if the given op is a StableHLO constant op. 
-def IsStableHLOConstantOp : Constraint($0.getDefiningOp())">>; - diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD new file mode 100644 index 00000000000000..00503b7ce45a0d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD @@ -0,0 +1,110 @@ +load("//tensorflow:pytype.default.bzl", "pytype_strict_library") +load( + "//tensorflow:tensorflow.default.bzl", + "tf_py_strict_test", + "tf_python_pybind_extension", +) +load("//tensorflow/compiler/mlir/quantization/stablehlo:internal_visibility_allowlist.bzl", "internal_visibility_allowlist") + +package_group( + name = "internal_visibility_allowlist_package", + packages = [ + "//tensorflow/compiler/mlir/lite/...", + "//tensorflow/compiler/mlir/quantization/...", + "//tensorflow/compiler/mlir/tf2xla/transforms/...", + "//tensorflow/lite/...", + "//third_party/cloud_tpu/inference_converter/...", # TPU Inference Converter V1 + ] + internal_visibility_allowlist(), +) + +package( + # copybara:uncomment default_applicable_licenses = ["@stablehlo//:license"], + default_visibility = [ + ":internal_visibility_allowlist_package", + "//tensorflow:__pkg__", + ], + licenses = ["notice"], +) + +pytype_strict_library( + name = "quantization", + srcs = ["quantization.py"], + deps = [ + ":pywrap_quantization", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib_py", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:representative_dataset", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:save_model", + "//tensorflow/core:protos_all_py", + "//tensorflow/python/saved_model:loader", + ], +) + +pytype_strict_library( + name = "quantize_model_test_base", + testonly = 1, + srcs = ["integration_test/quantize_model_test_base.py"], + tags = ["no_pip"], + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_spec", + "//tensorflow/python/module", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/saved_model:save", + "//tensorflow/python/types:core", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +tf_py_strict_test( + name = "quantize_model_test", + srcs = ["integration_test/quantize_model_test.py"], + deps = [ + ":quantization", + ":quantize_model_test_base", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:representative_dataset", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/saved_model:load", + "//tensorflow/python/saved_model:tag_constants", + "@absl_py//absl/testing:parameterized", + ], +) + +tf_python_pybind_extension( + name = "pywrap_quantization", + srcs = ["pywrap_quantization.cc"], + pytype_srcs = ["pywrap_quantization.pyi"], + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", + 
"//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:assign_ids", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:statistics", + "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:type_casters", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", + "@pybind11", + "@pybind11_abseil//pybind11_abseil:absl_casters", + "@pybind11_abseil//pybind11_abseil:import_status_module", + "@pybind11_abseil//pybind11_abseil:status_casters", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py new file mode 100644 index 00000000000000..a59d4a988c9b79 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py @@ -0,0 +1,349 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import itertools +from typing import Optional, Sequence + +from absl.testing import parameterized +import numpy as np + +from tensorflow.compiler.mlir.quantization.stablehlo.python import quantization +from tensorflow.compiler.mlir.quantization.stablehlo.python.integration_test import quantize_model_test_base +from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 as quant_opts_pb2 +from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as repr_dataset +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.platform import test +from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import tag_constants + +# Type aliases for quantization method protobuf enums. +_PresetMethod = quant_opts_pb2.QuantizationMethod.PresetMethod + + +def parameter_combinations(test_parameters): + """Generate all combinations of test parameters.""" + real_parameters = [] + for parameters in test_parameters: + keys = parameters.keys() + for curr in itertools.product(*parameters.values()): + real_parameters.append(dict(zip(keys, curr))) + return real_parameters + + +# Test cases for Static Range Quantization. 
+# Tries to run all test cases in both the graph mode (default in TF1) and the
+# eager mode (default in TF2) to ensure support for when TF2 is disabled.
+class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest):
+
+  @parameterized.parameters(
+      parameter_combinations([{
+          'activation_fn': [None],
+          'has_bias': [True, False],
+          'dim_sizes': [
+              # tf.MatMul cases.
+              ([None, 1024], [1024, 3]),  # dynamic batch dim.
+              ([1, 1024], [1024, 3]),
+              # tf.BatchMatMul cases.
+              ([10, 1, 1024], [10, 1024, 3]),
+              ([2, 3, 1, 1024], [2, 3, 1024, 3]),
+          ],
+      }])
+  )
+  @test_util.run_in_graph_and_eager_modes
+  def test_matmul_ptq_model(
+      self,
+      activation_fn: Optional[ops.Operation],
+      has_bias: bool,
+      dim_sizes: Sequence[int],
+  ):
+    target_opset = quant_opts_pb2.STABLEHLO
+
+    lhs_dim_size, rhs_dim_size = dim_sizes
+    input_shape = (*lhs_dim_size,)
+    filter_shape = (*rhs_dim_size,)
+    static_input_shape = [dim if dim is not None else 2 for dim in input_shape]
+    model = self._create_matmul_model(
+        input_shape,
+        filter_shape,
+        self._input_saved_model_path,
+        has_bias,
+        activation_fn,
+    )
+
+    rng = np.random.default_rng(seed=1235)
+    input_data = ops.convert_to_tensor(
+        rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
+            np.float32
+        )
+    )
+
+    def data_gen() -> repr_dataset.RepresentativeDataset:
+      for _ in range(100):
+        yield {
+            'input_tensor': rng.uniform(
+                low=0.0, high=1.0, size=static_input_shape
+            ).astype(np.float32)
+        }
+
+    dataset_path = self.create_tempfile('tfrecord').full_path
+    path_map = {'serving_default': dataset_path}
+    repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save(
+        {'serving_default': data_gen()}
+    )
+
+    config = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags={tag_constants.SERVING},
+        signature_keys=['serving_default'],
+        op_set=target_opset,
+        representative_datasets={
+            'serving_default': quant_opts_pb2.RepresentativeDatasetFile(
+                tfrecord_file_path=dataset_path
+            )
+        },
+        calibration_options=quant_opts_pb2.CalibrationOptions(
+            calibration_method=quant_opts_pb2.CalibrationOptions.CALIBRATION_METHOD_MIN_MAX
+        ),
+    )
+    quantization.quantize_saved_model(
+        self._input_saved_model_path,
+        self._output_saved_model_path,
+        config,
+    )
+
+    expected_outputs = model.matmul(input_data)
+
+    root = load.load(self._output_saved_model_path)
+    self.assertCountEqual(root.signatures.keys(), {'serving_default'})
+
+    new_outputs = root.signatures['serving_default'](
+        input_tensor=ops.convert_to_tensor(input_data)
+    )
+    # Tests that the quantized graph outputs similar values. The atol value is
+    # arbitrary.
+    # TODO: b/309674337 - Fix the large numerical errors.
+ self.assertAllClose(new_outputs, expected_outputs, atol=0.3) + + @parameterized.parameters( + parameter_combinations([{ + 'same_scale_op': [ + 'concatenate', + 'gather', + 'pad', + 'reshape', + 'select', + 'slice', + 'transpose', + ], + }]) + ) + @test_util.run_in_graph_and_eager_modes + def test_matmul_and_same_scale_ptq_model( + self, + same_scale_op: str, + ): + target_opset = quant_opts_pb2.STABLEHLO + + input_shape = (2, 3, 1, 1024) + filter_shape = (2, 3, 1024, 3) + static_input_shape = [dim if dim is not None else 2 for dim in input_shape] + + model = self._create_matmul_and_same_scale_model( + input_shape, + filter_shape, + self._input_saved_model_path, + same_scale_op, + ) + + rng = np.random.default_rng(seed=1235) + input_data = ops.convert_to_tensor( + rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( + np.float32 + ) + ) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=static_input_shape + ).astype(np.float32) + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8 + ), + tags={tag_constants.SERVING}, + signature_keys=['serving_default'], + op_set=target_opset, + representative_datasets={ + 'serving_default': quant_opts_pb2.RepresentativeDatasetFile( + tfrecord_file_path=dataset_path + ) + }, + calibration_options=quant_opts_pb2.CalibrationOptions( + calibration_method=quant_opts_pb2.CalibrationOptions.CALIBRATION_METHOD_MIN_MAX + ), + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + expected_outputs = model.matmul_and_same_scale(input_data) + + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + + new_outputs = root.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + # Tests that the quantized graph outputs similar values. The rtol value is + # arbitrary. + # TODO: b/309674337 - Fix the large numerical errors. + self.assertAllClose(new_outputs, expected_outputs, rtol=0.3) + + @parameterized.named_parameters( + { + 'testcase_name': 'none', + 'activation_fn': None, + 'has_bias': False, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.STABLEHLO, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': False, + }, + ) + @test_util.run_in_graph_and_eager_modes + def test_conv_ptq_model( + self, + activation_fn: Optional[ops.Operation], + has_bias: bool, + has_batch_norm: bool, + target_opset: quant_opts_pb2.OpSet, + input_shape_dynamic: bool, + enable_per_channel_quantization: bool, + dilations: Sequence[int] = None, + ): + input_shape = (None, None, None, 3) if input_shape_dynamic else (1, 3, 4, 3) + filter_shape = (2, 3, 3, 2) + strides = (1, 1, 1, 1) + model = self._create_conv2d_model( + input_shape, + filter_shape, + self._input_saved_model_path, + has_bias, + has_batch_norm, + activation_fn, + strides, + dilations, + ) + + # Generate model input data. 
+ rng = np.random.default_rng(seed=1224) + static_input_shape = [dim if dim is not None else 2 for dim in input_shape] + input_data = ops.convert_to_tensor( + rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( + np.float32 + ) + ) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=static_input_shape + ).astype(np.float32) + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + tags = {tag_constants.SERVING} + + config = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8 + ), + tags=tags, + signature_keys=['serving_default'], + op_set=target_opset, + representative_datasets={ + 'serving_default': quant_opts_pb2.RepresentativeDatasetFile( + tfrecord_file_path=dataset_path + ) + }, + enable_per_channel_quantization=enable_per_channel_quantization, + calibration_options=quant_opts_pb2.CalibrationOptions( + calibration_method=quant_opts_pb2.CalibrationOptions.CALIBRATION_METHOD_MIN_MAX + ), + ) + + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + expected_outputs = model.conv2d(input_data) + + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + + new_outputs = root.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + # Tests that the quantized graph outputs similar values. The rtol value is + # arbitrary. + self.assertAllClose(new_outputs, expected_outputs, rtol=0.04) + + def test_when_preset_not_srq_raise_error(self): + self._create_matmul_model( + input_shape=(1, 1024), + weight_shape=(1024, 3), + saved_model_path=self._input_saved_model_path, + ) + + config = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + preset_method=_PresetMethod.METHOD_NO_QUANTIZE + ), + tags={tag_constants.SERVING}, + signature_keys=['serving_default'], + op_set=quant_opts_pb2.STABLEHLO, + ) + + with self.assertRaisesRegex(ValueError, 'only supports static-range PTQ'): + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py new file mode 100644 index 00000000000000..86f7fadb671e1e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py @@ -0,0 +1,299 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Base test class for quantize_model tests."""
+from typing import Mapping, Sequence, Optional
+
+from absl.testing import parameterized
+import numpy as np
+import tensorflow  # pylint: disable=unused-import
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.module import module
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import save as saved_model_save
+from tensorflow.python.types import core
+
+
+class QuantizedModelTest(test.TestCase, parameterized.TestCase):
+  """Base test class for StableHLO quant tests."""
+
+  def setUp(self) -> None:
+    super().setUp()
+
+    # Many test cases for quantization involve creating and saving the input
+    # model and saving the output quantized model. These two member
+    # attributes can be used to specify the paths for such models,
+    # respectively. These paths will be cleaned up after each test case.
+    self._input_saved_model_path = self.create_tempdir('input').full_path
+    self._output_saved_model_path = self.create_tempdir('output').full_path
+    # Extra output path occasionally used for comparing two different
+    # quantized models.
+    self._output_saved_model_path_2 = self.create_tempdir('output2').full_path
+
+  def _create_matmul_model(
+      self,
+      input_shape: Sequence[int],
+      weight_shape: Sequence[int],
+      saved_model_path: str,
+      has_bias: bool = False,
+      activation_fn: Optional[ops.Operation] = None,
+      bias_size: Optional[int] = None,
+      use_biasadd: bool = True,
+  ) -> module.Module:
+    class MatmulModel(module.Module):
+      """A simple model with a single matmul.
+
+      Bias and activation function are optional.
+      """
+
+      def __init__(
+          self,
+          weight_shape: Sequence[int],
+          bias_size: Optional[int] = None,
+          activation_fn: Optional[ops.Operation] = None,
+          use_biasadd: bool = True,
+      ) -> None:
+        """Initializes a MatmulModel.
+
+        Args:
+          weight_shape: Shape of the weight tensor.
+          bias_size: If None, do not use bias. Else, use given size as bias.
+          activation_fn: The activation function to be used. No activation
+            function if None.
+          use_biasadd: If True, use BiasAdd for adding bias, else use AddV2.
+        """
+        self.bias_size = bias_size
+        self.activation_fn = activation_fn
+        self.use_biasadd = use_biasadd
+        self.filters = np.random.uniform(low=-1.0, high=1.0, size=weight_shape)
+
+        if bias_size is not None:
+          self.bias = np.random.uniform(low=-1.0, high=1.0, size=bias_size)
+
+      def has_bias(self) -> bool:
+        return self.bias_size is not None
+
+      def has_reshape(self) -> bool:
+        return self.has_bias() and self.bias_size != self.filters.shape[-1]
+
+      @def_function.function
+      def matmul(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]:
+        """Performs a matrix multiplication.
+
+        Depending on self.has_bias and self.activation_fn, it may add a bias
+        term or go through the activation function.
+
+        Args:
+          input_tensor: Input tensor to matmul with the filter.
+
+        Returns:
+          A map of: output key -> output result.
+        """
+        out = math_ops.matmul(input_tensor, self.filters, name='sample/matmul')
+
+        return {'output': out}
+
+    # If bias_size is not explicitly given, it should default to width of weight.
+ if bias_size is None and has_bias: + bias_size = weight_shape[-1] + + # Verify that when bias_size is not None, has_bias should be True. + # And if bias_size is None, has_bias should be False. + assert (bias_size is None) != has_bias + + model = MatmulModel(weight_shape, bias_size, activation_fn) + saved_model_save.save( + model, + saved_model_path, + signatures=model.matmul.get_concrete_function( + tensor_spec.TensorSpec( + shape=input_shape, dtype=dtypes.float32, name='input_tensor' + ) + ), + ) + return model + + def _create_matmul_and_same_scale_model( + self, + input_shape: Sequence[int], + weight_shape: Sequence[int], + saved_model_path: str, + same_scale_op: str, + ) -> module.Module: + class MatmulAndSameScaleModel(module.Module): + """A simple model with a same-scale op. + + Op name in StableHLO dialect is given as a string. + """ + + def __init__( + self, + weight_shape: Sequence[int], + same_scale_op: str, + ) -> None: + """Initializes a MatmulModel. + + Args: + weight_shape: Shape of the weight tensor. + same_scale_op: Name of the same-scale op to be tested. Raises error + when an unknown name is given. + """ + self.filters = np.random.uniform(low=-1.0, high=1.0, size=weight_shape) + self.same_scale_op = same_scale_op + + @def_function.function + def matmul_and_same_scale( + self, input_tensor: core.Tensor + ) -> Mapping[str, core.Tensor]: + """Performs a matrix multiplication. + + Args: + input_tensor: Input tensor to matmul with the filter. + + Returns: + A map of: output key -> output result. + """ + out = math_ops.matmul(input_tensor, self.filters, name='sample/matmul') + + if self.same_scale_op == 'concatenate': + ones = array_ops.ones_like(out) + out = array_ops.concat([out, ones], 0) + elif self.same_scale_op == 'gather': + out = array_ops.gather(out, indices=[0], axis=0) + elif self.same_scale_op == 'pad': + paddings = array_ops.ones( + (array_ops.rank(out), 2), dtype=dtypes.int32 + ) + out = array_ops.pad(out, paddings, 'CONSTANT') + elif self.same_scale_op == 'reshape': + out = array_ops.reshape(out, (array_ops.size(out), -1)) + elif self.same_scale_op == 'select': + rng = np.random.default_rng(seed=1234) + condition = ops.convert_to_tensor( + rng.uniform(low=0.0, high=1.0, size=out.shape) < 0.5 + ) + ones = array_ops.ones_like(out) + out = math_ops.select(condition, out, ones) + elif self.same_scale_op == 'slice': + begin = array_ops.zeros( + (array_ops.rank(out)), dtype=dtypes.int32 + ) + size = array_ops.ones( + (array_ops.rank(out)), dtype=dtypes.int32 + ) + out = array_ops.slice(out, begin, size) + elif self.same_scale_op == 'transpose': + out = array_ops.transpose(out) + else: + raise NotImplementedError( + '{} is not implemented for integration test.'.format( + self.same_scale_op + ) + ) + + return {'output': out} + + model = MatmulAndSameScaleModel(weight_shape, same_scale_op) + saved_model_save.save( + model, + saved_model_path, + signatures=model.matmul_and_same_scale.get_concrete_function( + tensor_spec.TensorSpec( + shape=input_shape, dtype=dtypes.float32, name='input_tensor' + ) + ), + ) + return model + + def _create_conv2d_model( + self, + input_shape: Sequence[int], + filter_shape: Sequence[int], + saved_model_path: str, + has_bias: bool = False, + has_batch_norm: bool = False, + activation_fn: Optional[ops.Operation] = None, + strides: Sequence[int] = (1, 1, 1, 1), + dilations: Sequence[int] = (1, 1, 1, 1), + padding: str = 'SAME', + ) -> module.Module: + class ConvModel(module.Module): + """A simple model with a single conv2d, bias and relu.""" + + 
def __init__(self): + self.out_channel_size = filter_shape[-1] + + # This ensures filters will have different value range per out channel + self.filters = np.stack( + [ + np.random.uniform( + low=-(i + 1), high=(i + 1), size=filter_shape[:-1] + ).astype('f4') + for i in range(self.out_channel_size) + ], + axis=-1, + ) + + self.bias = np.random.uniform( + low=0, high=10, size=(self.out_channel_size) + ).astype('f4') + + @def_function.function + def conv2d(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: + """Performs a 2D convolution operation. + + Args: + input_tensor: Input tensor to perform convolution on. + + Returns: + A map of: output key -> output result. + """ + scale = [1.0] * self.out_channel_size + offset = [0.5] * self.out_channel_size + mean, variance = scale, offset + out = nn_ops.conv2d( + input_tensor, + self.filters, + strides=strides, + dilations=dilations, + padding=padding, + data_format='NHWC', + name='sample/conv', + ) + if has_batch_norm: + # Fusing is supported for non-training case. + out, _, _, _, _, _ = nn_ops.fused_batch_norm_v3( + out, scale, offset, mean, variance, is_training=False + ) + return {'output': out} + + model = ConvModel() + saved_model_save.save( + model, + saved_model_path, + signatures=model.conv2d.get_concrete_function( + tensor_spec.TensorSpec( + shape=input_shape, dtype=dtypes.float32, name='input_tensor' + ) + ), + ) + return model diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc new file mode 100644 index 00000000000000..08fac877481157 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc @@ -0,0 +1,187 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "pybind11/cast.h" // from @pybind11 +#include "pybind11/detail/common.h" // from @pybind11 +#include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11/pytypes.h" // from @pybind11 +#include "pybind11/stl.h" // from @pybind11 // IWYU pragma: keep +#include "pybind11_abseil/absl_casters.h" // from @pybind11_abseil // IWYU pragma: keep +#include "pybind11_abseil/import_status_module.h" // from @pybind11_abseil +#include "pybind11_abseil/status_casters.h" // from @pybind11_abseil // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" + +namespace py = pybind11; + +namespace { + +using ::stablehlo::quantization::AddCalibrationStatistics; +using ::stablehlo::quantization::AssignIdsToCustomAggregatorOps; +using ::stablehlo::quantization::EnableDebugging; +using ::stablehlo::quantization::io::CreateTmpDir; +using ::tensorflow::SignatureDef; +using ::tensorflow::quantization::ExportedModel; +using ::tensorflow::quantization::PyFunctionLibrary; +using ::tensorflow::quantization::QuantizationOptions; + +} // namespace + +PYBIND11_MODULE(pywrap_quantization, m) { + // Supports absl::Status type conversions. + pybind11::google::ImportStatusModule(); + + m.doc() = "StableHLO Quantization APIs."; + + m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. 
+ // LINT.IfChange + "static_range_ptq", + [](const absl::string_view src_saved_model_path, + const absl::string_view dst_saved_model_path, + const QuantizationOptions& quantization_options, + const std::vector& signature_keys, + const absl::flat_hash_map& + signature_def_map, + const absl::flat_hash_map& function_aliases, + const PyFunctionLibrary& py_function_library, + py::object representative_dataset) -> absl::Status { + // LINT.ThenChange(pywrap_quantization.pyi:static_range_ptq) + std::unordered_set tags; + tags.insert(quantization_options.tags().begin(), + quantization_options.tags().end()); + + absl::StatusOr exported_model = + QuantizePtqModelPreCalibration(src_saved_model_path, signature_keys, + tags, quantization_options, + function_aliases); + if (!exported_model.ok()) return exported_model.status(); + + AssignIdsToCustomAggregatorOps(*exported_model->mutable_graph_def()); + + const absl::StatusOr precalibrated_saved_model_dir = + CreateTmpDir(); + if (!precalibrated_saved_model_dir.ok()) { + throw py::value_error(absl::StrFormat( + "Failed to create tmp dir for precalibrated saved model: %s", + precalibrated_saved_model_dir.status().ToString())); + } + + py_function_library.SaveExportedModel( + *precalibrated_saved_model_dir, *exported_model, + src_saved_model_path, tags, signature_def_map); + + py_function_library.RunCalibration( + *precalibrated_saved_model_dir, signature_keys, tags, + quantization_options.calibration_options(), + quantization_options.force_graph_mode_calibration(), + representative_dataset); + + if (absl::Status status = AddCalibrationStatistics( + *exported_model->mutable_graph_def(), + quantization_options.calibration_options(), + py_function_library); + !status.ok()) { + LOG(WARNING) << "Some CustomAggregator ops do not have min or max " + "values. Parts of the graph are not quantized. " + << status; + } + + if (quantization_options.has_debugger_options()) { + EnableDebugging(*exported_model, + quantization_options.debugger_options(), + py_function_library, src_saved_model_path, tags, + signature_def_map); + } + + const absl::StatusOr calibrated_saved_model_path = + CreateTmpDir(); + if (!calibrated_saved_model_path.ok()) { + throw py::value_error(absl::StrFormat( + "Failed to create tmp dir for calibrated saved model: %s", + calibrated_saved_model_path.status().ToString())); + } + + py_function_library.SaveExportedModel( + *calibrated_saved_model_path, *exported_model, src_saved_model_path, + tags, signature_def_map); + + const absl::flat_hash_map + function_aliases_after_calibration( + exported_model->function_aliases().begin(), + exported_model->function_aliases().end()); + + const absl::StatusOr post_calibrated_exported_model = + QuantizePtqModelPostCalibration( + *calibrated_saved_model_path, signature_keys, tags, + quantization_options, function_aliases_after_calibration); + if (!post_calibrated_exported_model.ok()) { + return post_calibrated_exported_model.status(); + } + + // Remove the `tpu` tag from the debug quantized saved model as it is + // for CPU. Note the 'tpu' value should be the same as `TPU` defined in + // tensorflow/python/saved_model/tag_constants.py. 
+        if (quantization_options.has_debugger_options()) {
+          tags.erase("tpu");
+        }
+        py_function_library.SaveExportedModel(
+            dst_saved_model_path, *post_calibrated_exported_model,
+            *calibrated_saved_model_path, tags, signature_def_map);
+
+        return absl::OkStatus();
+      },
+      R"pbdoc(
+      Runs static-range post-training quantization (PTQ) on a SavedModel at
+      `src_saved_model_path` and saves the resulting model to
+      `dst_saved_model_path`.
+
+      The user should pass a serialized `QuantizationOptions` for the
+      `quantization_options_serialized` argument, and a signature key ->
+      serialized `SignatureDef` mapping for the `signature_def_map_serialized`
+      argument.
+
+      `function_aliases` maps actual function names to the function aliases, as
+      defined by the `MetaGraphDef::MetaInfoDef::function_aliases` from the
+      input SavedModel.
+
+      Raises a `StatusNotOk` exception if the run was unsuccessful.
+      )pbdoc",
+      py::arg("saved_model_path"), py::arg("dst_saved_model_path"),
+      py::arg("quantization_options_serialized"), py::kw_only(),
+      py::arg("signature_keys"), py::arg("signature_def_map_serialized"),
+      py::arg("function_aliases"), py::arg("py_function_library"),
+      py::arg("representative_dataset"));
+}
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi
new file mode 100644
index 00000000000000..1870115a4aa847
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi
@@ -0,0 +1,33 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from typing import Any
+
+from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib
+from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as rd
+
+# LINT.IfChange(static_range_ptq)
+def static_range_ptq(
+    src_saved_model_path: str,
+    dst_saved_model_path: str,
+    quantization_options_serialized: bytes,
+    *,
+    signature_keys: list[str],
+    signature_def_map_serialized: dict[str, bytes],
+    function_aliases: dict[str, str],
+    py_function_library: py_function_lib.PyFunctionLibrary,
+    representative_dataset: rd.RepresentativeDatasetOrMapping,
+) -> Any: ...  # Status
+
+# LINT.ThenChange()
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py
new file mode 100644
index 00000000000000..fab36e2005110f
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py
@@ -0,0 +1,100 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""StableHLO Quantizer.""" +from typing import Mapping + +from tensorflow.compiler.mlir.quantization.stablehlo.python import pywrap_quantization +from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 as quant_opts_pb2 +from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib +from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as rd +from tensorflow.compiler.mlir.quantization.tensorflow.python import save_model +from tensorflow.core.protobuf import meta_graph_pb2 +from tensorflow.python.saved_model import loader_impl + +# Mapping of signature def key -> SignatureDef. +_SignatureDefMap = Mapping[str, meta_graph_pb2.SignatureDef] + + +def _serialize_signature_def_map( + signature_def_map: _SignatureDefMap, +) -> dict[str, bytes]: + """Serializes SignatureDef values in `signature_def_map`. + + Args: + signature_def_map: Signature key -> SignatureDef mapping. + + Returns: + Signature def map where the values (`SignatureDef`) are serialized. + """ + signature_def_map_serialized = {} + for key, signature_def in signature_def_map.items(): + signature_def_map_serialized[key] = signature_def.SerializeToString() + + return signature_def_map_serialized + + +# TODO: b/310594193 - Export API to pip package. +def quantize_saved_model( + src_saved_model_path: str, + dst_saved_model_path: str, + config: quant_opts_pb2.QuantizationOptions, +) -> None: + """Quantizes a saved model. + + Args: + src_saved_model_path: Path to the directory for the source SavedModel. + dst_saved_model_path: Path to the directory for the destination SavedModel. + config: Quantization configuration. + + Raises: + ValueError: When `config` was not configured for static-range PTQ + single representative dataset. + """ + if not ( + config.quantization_method.preset_method + == quant_opts_pb2.QuantizationMethod.PresetMethod.METHOD_STATIC_RANGE_INT8 + and len(config.representative_datasets) == 1 + ): + raise ValueError( + '`quantize_saved_model` currently only supports static-range PTQ with a' + ' single signature.' 
+ ) + + signature_def_map = save_model.get_signatures_from_saved_model( + src_saved_model_path, + list(config.signature_keys), + set(config.tags), + ) + + loader = loader_impl.SavedModelLoader(src_saved_model_path) + function_aliases = loader.get_meta_graph_def_from_tags( + config.tags + ).meta_info_def.function_aliases + + representative_dataset = rd.RepresentativeDatasetLoader( + config.representative_datasets + ).load() + + signature_def_map_serialized = _serialize_signature_def_map(signature_def_map) + pywrap_quantization.static_range_ptq( + src_saved_model_path, + dst_saved_model_path, + quantization_options_serialized=config.SerializeToString(), + signature_keys=list(config.signature_keys), + signature_def_map_serialized=signature_def_map_serialized, + function_aliases=dict(function_aliases), + py_function_library=py_function_lib.PyFunctionLibrary(), + representative_dataset=representative_dataset, + ) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto new file mode 100644 index 00000000000000..c28e95da07004f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -0,0 +1,49 @@ +// Protobuf messages for configuring StableHLO Quantizer. +syntax = "proto3"; + +package stablehlo.quantization; + +option cc_enable_arenas = true; + +// Represents a single TFRecord file. See +// https://www.tensorflow.org/tutorials/load_data/tfrecord for details on the +// TFRecord format. +// Next ID: 2 +message TfRecordFile { + string path = 1; +} + +// Configures a single representative dataset used to calibrate a single +// function. +// Next ID: 3 +message RepresentativeDatasetConfig { + oneof file { + // Represents representative dataset saved as a .tfrecord file format. + TfRecordFile tf_record = 1; + } + + // [TF SavedModel] Identifies a SignatureDef which represents a single + // logical function in a graph. + optional string signature_key = 2; +} + +// Preset config for static-range post-training quantization (PTQ). +// Minimal user input about representative datasets is required. Representative +// datasets are required for static-range PTQ to retrieve quantization +// statistics via calibration. +// Next ID: 2 +message StaticRangePtqPreset { + // Configures representative dataset. Each item corresponds to a + // representative dataset used to calibrate a function. + repeated RepresentativeDatasetConfig representative_datasets = 1; +} + +// Quantization configuration for StableHLO Quantizer. This is the primary +// message containing all configurable options. +// Next ID: 2 +message QuantizationConfig { + oneof preset { + // Performs best-effort static-range post-training quantization (PTQ). 
+ StaticRangePtqPreset static_range_ptq_preset = 1; + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD index 4c078033215618..6fc15864fb0f8b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD @@ -1,6 +1,6 @@ -load("//tensorflow:tensorflow.default.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "filegroup") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -46,3 +46,24 @@ tf_cc_test( "@local_tsl//tsl/platform:protobuf", ], ) + +tf_cc_test( + name = "stablehlo_op_quant_spec_test", + srcs = ["stablehlo_op_quant_spec_test.cc"], + deps = [ + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common:test_base", + "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/core:test", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:QuantOps", + "@stablehlo//:stablehlo_ops", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir index 65c8497aa9a41a..713c55281ff0e3 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir @@ -95,12 +95,15 @@ func.func @uniform_quantize_and_dequantize_type_exensions(%arg0: tensor (d0 : compressed) }> + +// CHECK: #[[$SV:.*]] = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }> // CHECK-LABEL: func @uniform_quantize_and_dequantize_sparse_tensor_encoding -func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor (d0 : compressed) }>>) -> () { - // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor (d0 : compressed) }>>) -> tensor (d0 : compressed) }>> - %0 = mhlo.uniform_quantize %arg0 : (tensor (d0 : compressed) }>>) -> tensor, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>> - // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] : (tensor (d0 : compressed) }>>, tensor) -> tensor (d0 : compressed) }>> - %1 = mhlo.uniform_dequantize %0 : (tensor, #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }>>) -> tensor (d0 : compressed) }>> +func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor) -> () { + // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor) -> tensor + %0 = mhlo.uniform_quantize %arg0 : (tensor) -> tensor, #SV> + // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] : (tensor, tensor) -> tensor + %1 = mhlo.uniform_dequantize %0 : (tensor, #SV>) -> tensor return } @@ -341,6 +344,91 @@ func.func @requantize_merged_zp_zero( // ----- +// CHECK-LABEL: func @requantize_per_channel +func.func @requantize_per_channel( + %arg0: tensor<2x2x!quant.uniform> + ) -> 
tensor<2x2x!quant.uniform> { + // CHECK-DAG: %[[VAL1:.*]] = mhlo.convert %arg0 : (tensor<2x2xi8>) -> tensor<2x2xf32> + // CHECK-DAG: %[[MERGED_SCALE:.*]] = mhlo.constant dense<[2.000000e+00, 5.000000e-01]> : tensor<2xf32> + // CHECK: %[[VAL2:.*]] = chlo.broadcast_multiply %[[VAL1]], %[[MERGED_SCALE]] + // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-DAG: %[[MERGED_ZP:.*]] = mhlo.constant dense<[-5.000000e+00, -2.000000e+00]> : tensor<2xf32> + // CHECK: %[[VAL3:.*]] = chlo.broadcast_add %[[VAL2]], %[[MERGED_ZP]] + // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-1.280000e+02> : tensor + // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<1.270000e+02> : tensor + // CHECK: %[[VAL4:.*]] = mhlo.clamp %[[QUANT_MIN]], %[[VAL3]], %[[QUANT_MAX]] + // CHECK: %[[VAL5:.*]] = mhlo.round_nearest_even %[[VAL4]] : tensor<2x2xf32> + // CHECK: %[[VAL6:.*]] = mhlo.convert %[[VAL5]] : (tensor<2x2xf32>) -> tensor<2x2xi8> + %0 = mhlo.uniform_quantize %arg0 : ( + tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @requantize_per_channel_to_per_tensor +func.func @requantize_per_channel_to_per_tensor( + %arg0: tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> { + // CHECK-DAG: %[[VAL1:.*]] = mhlo.convert %arg0 : (tensor<2x2xi8>) -> tensor<2x2xf32> + // CHECK-DAG: %[[MERGED_SCALE:.*]] = mhlo.constant dense<[2.000000e+00, 1.000000e+00]> : tensor<2xf32> + // CHECK: %[[VAL2:.*]] = chlo.broadcast_multiply %[[VAL1]], %[[MERGED_SCALE]] + // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-DAG: %[[MERGED_ZP:.*]] = mhlo.constant dense<[-5.000000e+00, -1.000000e+00]> : tensor<2xf32> + // CHECK: %[[VAL3:.*]] = chlo.broadcast_add %[[VAL2]], %[[MERGED_ZP]] + // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-1.280000e+02> : tensor + // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<1.270000e+02> : tensor + // CHECK: %[[VAL4:.*]] = mhlo.clamp %[[QUANT_MIN]], %[[VAL3]], %[[QUANT_MAX]] + // CHECK: %[[VAL5:.*]] = mhlo.round_nearest_even %[[VAL4]] : tensor<2x2xf32> + // CHECK: %[[VAL6:.*]] = mhlo.convert %[[VAL5]] : (tensor<2x2xf32>) -> tensor<2x2xi8> + %0 = mhlo.uniform_quantize %arg0 : ( + tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @requantize_per_tensor_to_per_channel +func.func @requantize_per_tensor_to_per_channel( + %arg0: tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> { + // CHECK-DAG: %[[VAL1:.*]] = mhlo.convert %arg0 : (tensor<2x2xi8>) -> tensor<2x2xf32> + // CHECK-DAG: %[[MERGED_SCALE:.*]] = mhlo.constant dense<[1.000000e+00, 5.000000e-01]> : tensor<2xf32> + // CHECK: %[[VAL2:.*]] = chlo.broadcast_multiply %[[VAL1]], %[[MERGED_SCALE]] + // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-DAG: %[[MERGED_ZP:.*]] = mhlo.constant dense<[-1.000000e+00, -2.000000e+00]> : tensor<2xf32> + // CHECK: %[[VAL3:.*]] = chlo.broadcast_add %[[VAL2]], %[[MERGED_ZP]] + // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-1.280000e+02> : tensor + // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<1.270000e+02> : tensor + // CHECK: %[[VAL4:.*]] = mhlo.clamp %[[QUANT_MIN]], %[[VAL3]], %[[QUANT_MAX]] + // CHECK: %[[VAL5:.*]] = mhlo.round_nearest_even 
%[[VAL4]] : tensor<2x2xf32> + // CHECK: %[[VAL6:.*]] = mhlo.convert %[[VAL5]] : (tensor<2x2xf32>) -> tensor<2x2xi8> + %0 = mhlo.uniform_quantize %arg0 : ( + tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} + +// ----- + +func.func @requantize_per_channel_change_axis( + %arg0: tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> { + // expected-error@+2 {{Cannot requantize while changing quantization_axis}} + // expected-error@+1 {{failed to legalize operation 'mhlo.uniform_quantize' that was explicitly marked illegal}} + %0 = mhlo.uniform_quantize %arg0 : ( + tensor<2x2x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} + +// ----- + // CHECK-LABEL: func @dot func.func @dot(%arg0: tensor<2x2x!quant.uniform>, %arg1: tensor<2x2x!quant.uniform> @@ -493,7 +581,6 @@ func.func @dot_dynamic_result_dim( // CHECK-SAME: broadcast_dimensions = dense<1> // CHECK-SAME: (tensor, tensor<2xi64>) -> tensor - %0 = "mhlo.dot" (%arg0, %arg1) : ( tensor>, tensor<2x?x!quant.uniform> @@ -503,6 +590,39 @@ func.func @dot_dynamic_result_dim( // ----- +// CHECK-LABEL: func @dot_dynamic_batch_dim +func.func @dot_dynamic_batch_dim( + %arg0: tensor>, + %arg1: tensor<2x2x!quant.uniform> + ) -> tensor> { + // CHECK: "mhlo.dot_general" + // CHECK-SAME: lhs_contracting_dimensions = [1] + // CHECK-SAME: rhs_contracting_dimensions = [0] + // CHECK-SAME: (tensor, tensor<2x2xi8>) -> tensor + + // CHECK: mhlo.reduce + // CHECK-SAME: applies mhlo.add across dimensions = [1] + // CHECK-SAME: (tensor, tensor) -> tensor + // CHECK: mhlo.dynamic_broadcast_in_dim + // CHECK-SAME: broadcast_dimensions = dense<0> + // CHECK-SAME: (tensor, tensor<2xi64>) -> tensor + + // CHECK: mhlo.reduce + // CHECK-SAME: applies mhlo.add across dimensions = [0] + // CHECK-SAME: (tensor<2x2xi32>, tensor) -> tensor<2xi32> + // CHECK: mhlo.dynamic_broadcast_in_dim + // CHECK-SAME: broadcast_dimensions = dense<1> + // CHECK-SAME: (tensor<2xi32>, tensor<2xi64>) -> tensor + + %0 = "mhlo.dot" (%arg0, %arg1) : ( + tensor>, + tensor<2x2x!quant.uniform> + ) -> tensor> + return %0 : tensor> +} + +// ----- + // CHECK-LABEL: func @dot_general func.func @dot_general( %arg0: tensor<2x5x6x!quant.uniform>, @@ -1113,6 +1233,27 @@ func.func @conv2d_static( // ----- +// CHECK-LABEL: func @conv2d_default_attr +func.func @conv2d_default_attr( + %arg0: tensor<128x28x28x1x!quant.uniform>, + %arg1: tensor<3x3x1x128x!quant.uniform> + ) -> tensor<128x26x26x128x!quant.uniform> { + // CHECK: mhlo.convolution + // CHECK-NOT: quant.uniform + %0 = mhlo.convolution(%arg0, %arg1) + dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], + window = { + } + { + batch_group_count = 1 : i64, + feature_group_count = 1 : i64 + } : (tensor<128x28x28x1x!quant.uniform>, tensor<3x3x1x128x!quant.uniform>) + -> tensor<128x26x26x128x!quant.uniform> + return %0 : tensor<128x26x26x128x!quant.uniform> +} + +// ----- + // CHECK-LABEL: func @conv2d_static_padding func.func @conv2d_static_padding( %arg0: tensor<128x28x28x1x!quant.uniform>, @@ -1660,6 +1801,21 @@ func.func @broadcast( // ----- +// CHECK-LABEL: func @broadcast_per_channel +func.func @broadcast_per_channel( + %arg0: tensor<2x!quant.uniform> + ) -> tensor<128x26x26x2x!quant.uniform> { + // CHECK: "mhlo.broadcast_in_dim" + // CHECK-SAME: broadcast_dimensions = dense<3> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<128x26x26x2xi32> + %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<3> : tensor<1xi64>}: ( + 
tensor<2x!quant.uniform> + ) -> tensor<128x26x26x2x!quant.uniform> + return %0 : tensor<128x26x26x2x!quant.uniform> +} + +// ----- + // CHECK-LABEL: func @max func.func @max( %arg0: tensor<1x2x!quant.uniform> @@ -1675,6 +1831,21 @@ func.func @max( // ----- +// CHECK-LABEL: func @max_per_channel +func.func @max_per_channel( + %arg0: tensor<1x2x!quant.uniform> + ) -> tensor<1x2x!quant.uniform> { + // CHECK: mhlo.maximum + // CHECK-SAME: tensor<1x2xi8> + %0 = "mhlo.maximum"(%arg0, %arg0) : ( + tensor<1x2x!quant.uniform>, + tensor<1x2x!quant.uniform> + ) -> tensor<1x2x!quant.uniform> + return %0 : tensor<1x2x!quant.uniform> +} + +// ----- + // CHECK-LABEL: func @min func.func @min( %arg0: tensor<1x2x!quant.uniform> @@ -1690,6 +1861,21 @@ func.func @min( // ----- +// CHECK-LABEL: func @min_per_channel +func.func @min_per_channel( + %arg0: tensor<1x2x!quant.uniform> + ) -> tensor<1x2x!quant.uniform> { + // CHECK: mhlo.minimum + // CHECK-SAME: tensor<1x2xi8> + %0 = "mhlo.minimum"(%arg0, %arg0) : ( + tensor<1x2x!quant.uniform>, + tensor<1x2x!quant.uniform> + ) -> tensor<1x2x!quant.uniform> + return %0 : tensor<1x2x!quant.uniform> +} + +// ----- + // CHECK-LABEL: func @function(%arg0: tensor<1x2xi8>) -> tensor<1x2xi8> func.func @function( %arg0: tensor<1x2x!quant.uniform> @@ -1700,27 +1886,124 @@ func.func @function( // ----- -func.func @min_mix_uq_type1( - %arg0: tensor<1x2x!quant.uniform>, - %arg1: tensor<1x2x!quant.uniform> - ) -> tensor<1x2x!quant.uniform> { - // expected-error@+1 {{failed to legalize operation 'mhlo.minimum' that was explicitly marked illegal}} - %0 = "mhlo.minimum"(%arg0, %arg1) : ( - tensor<1x2x!quant.uniform>, - tensor<1x2x!quant.uniform> - ) -> tensor<1x2x!quant.uniform> - return %0 : tensor<1x2x!quant.uniform> +// CHECK-LABEL: func @concatenate +func.func @concatenate( + %arg0: tensor<3x2x!quant.uniform:f32, 5.000000e-03>>, + %arg1: tensor<1x2x!quant.uniform:f32, 5.000000e-03>> + ) -> tensor<4x2x!quant.uniform:f32, 5.000000e-03>> { + // CHECK: mhlo.concatenate + // CHECK-SAME: (tensor<3x2xi8>, tensor<1x2xi8>) -> tensor<4x2xi8> + %0 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : ( + tensor<3x2x!quant.uniform:f32, 5.000000e-03>>, + tensor<1x2x!quant.uniform:f32, 5.000000e-03>> + ) -> tensor<4x2x!quant.uniform:f32, 5.000000e-03>> + return %0 : tensor<4x2x!quant.uniform:f32, 5.000000e-03>> } // ----- -func.func @min_mix_uq_type2( - %arg0: tensor<1x2x!quant.uniform> - ) -> tensor<1x2x!quant.uniform> { - // expected-error@+1 {{failed to legalize operation 'mhlo.minimum' that was explicitly marked illegal}} - %0 = "mhlo.minimum"(%arg0, %arg0) : ( - tensor<1x2x!quant.uniform>, - tensor<1x2x!quant.uniform> - ) -> tensor<1x2x!quant.uniform> - return %0 : tensor<1x2x!quant.uniform> +// CHECK-LABEL: func @pad +func.func @pad( + %arg0: tensor<2x3x!quant.uniform:f32, 5.000000e-03>>, + %arg1: tensor:f32, 5.000000e-03>> + ) -> tensor<5x9x!quant.uniform:f32, 5.000000e-03>> { + // CHECK: mhlo.pad + // CHECK-SAME: (tensor<2x3xi8>, tensor) -> tensor<5x9xi8> + %0 = "mhlo.pad"(%arg0, %arg1) { + edge_padding_low = dense<[0, 1]> : tensor<2xi64>, + edge_padding_high = dense<[2, 1]> : tensor<2xi64>, + interior_padding = dense<[1, 2]> : tensor<2xi64> + }: ( + tensor<2x3x!quant.uniform:f32, 5.000000e-03>>, + tensor:f32, 5.000000e-03>> + ) -> tensor<5x9x!quant.uniform:f32, 5.000000e-03>> + return %0 : tensor<5x9x!quant.uniform:f32, 5.000000e-03>> +} + +// ----- + +// CHECK-LABEL: func @reshape +func.func @reshape( + %arg0: tensor<1x3x!quant.uniform> + ) -> tensor<3x1x!quant.uniform> { + 
// CHECK: mhlo.reshape + // CHECK-SAME: (tensor<1x3xi8>) -> tensor<3x1xi8> + %0 = "mhlo.reshape"(%arg0) : ( + tensor<1x3x!quant.uniform> + ) -> tensor<3x1x!quant.uniform> + return %0 : tensor<3x1x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @select +func.func @select( + %arg0: tensor<1x3xi1>, + %arg1: tensor<1x3x!quant.uniform>, + %arg2: tensor<1x3x!quant.uniform> + ) -> tensor<1x3x!quant.uniform> { + // CHECK: mhlo.select + // CHECK-SAME: tensor<1x3xi8> + %0 = "mhlo.select"(%arg0, %arg1, %arg2) : ( + tensor<1x3xi1>, + tensor<1x3x!quant.uniform>, + tensor<1x3x!quant.uniform> + ) -> tensor<1x3x!quant.uniform> + return %0 : tensor<1x3x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @transpose +func.func @transpose( + %arg0: tensor<3x1x!quant.uniform> + ) -> tensor<1x3x!quant.uniform> { + // CHECK: mhlo.transpose + // CHECK-SAME: (tensor<3x1xi8>) -> tensor<1x3xi8> + %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : ( + tensor<3x1x!quant.uniform> + ) -> tensor<1x3x!quant.uniform> + return %0 : tensor<1x3x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @gather +func.func @gather( + %arg0: tensor<3x4x2x!quant.uniform>, + %arg1: tensor<2x3x2xi64> + ) -> tensor<2x3x2x2x!quant.uniform> { + // CHECK: mhlo.gather + // CHECK-SAME: (tensor<3x4x2xi8>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xi8> + %0 = "mhlo.gather"(%arg0, %arg1) { + dimension_numbers = #mhlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = dense<[1, 2, 2]> : tensor<3xi64>, + indices_are_sorted = false + } : ( + tensor<3x4x2x!quant.uniform>, + tensor<2x3x2xi64> + ) -> tensor<2x3x2x2x!quant.uniform> + return %0 : tensor<2x3x2x2x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: func @slice +func.func @slice( + %arg0: tensor<3x4x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> { + // CHECK: mhlo.slice + // CHECK-SAME: (tensor<3x4xi8>) -> tensor<2x2xi8> + %0 = "mhlo.slice"(%arg0) { + start_indices = dense<[1, 2]> : tensor<2xi64>, + limit_indices = dense<[3, 4]> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64> + } : ( + tensor<3x4x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/populate_shape.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/populate_shape.mlir new file mode 100644 index 00000000000000..05f10405356ba9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/populate_shape.mlir @@ -0,0 +1,44 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -populate-shape --mlir-print-ir-after-all | FileCheck %s + +// CHECK-LABEL: @populate_shape_for_custom_aggregator +func.func @populate_shape_for_custom_aggregator(%input: tensor) { + // CHECK: %[[OUTPUT:.*]] = "tf.CustomAggregator"(%[[INPUT:.*]]) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor + %0 = "tf.CustomAggregator"(%input) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor<*xf32> + func.return +} + +// ---- + +// CHECK-LABEL: @populate_shape_for_xla_call_module +func.func @populate_shape_for_xla_call_module(%input: tensor) { + %cst = 
"tf.Const"() {value = dense<3.00000000e-1> : tensor<1x1x64x256xf32>} : () -> tensor<1x1x64x256xf32> + // CHECK: %[[OUTPUT:.*]] = "tf.XlaCallModule"(%[[INPUT:.*]], %[[CST:.*]]) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> tensor + %0 = "tf.XlaCallModule"(%input, %cst) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> tensor<*xf32> + func.return +} + +// ---- + +// CHECK-LABEL: @populate_shape_for_chain_of_ops +func.func @populate_shape_for_chain_of_ops(%input: tensor) { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<1x1x64x256xf32>} : () -> tensor<1x1x64x256xf32> + // CHECK: %[[VAL_0:.*]] = "tf.CustomAggregator"(%[[INPUT:.*]]) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor + // CHECK: %[[VAL_1:.*]] = "tf.XlaCallModule"(%[[VAL_0:.*]], %[[CST:.*]]) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> tensor + // CHECK: %[[VAL_2:.*]] = "tf.CustomAggregator"(%[[VAL_1:.*]]) <{id = "49d53b1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor + %0 = "tf.CustomAggregator"(%input) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor<*xf32> + %1 = "tf.XlaCallModule"(%0, %cst) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<*xf32>, tensor<1x1x64x256xf32>) -> tensor<*xf32> + %2 = "tf.CustomAggregator"(%1) <{id = "49d53b1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> + func.return +} + +// ---- + +// CHECK-LABEL: @populate_shape_for_xla_call_module_failure_not_single_output +func.func @populate_shape_for_xla_call_module_failure_not_single_output(%input: tensor) { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<1x1x64x256xf32>} : () -> tensor<1x1x64x256xf32> + // expected-error @+2 {{XlaCallModuleOp doesn't have 1 
output.}} + %0, %1 = "tf.XlaCallModule"(%input, %cst) <{Sout = [#tf_type.shape, #tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> (tensor<*xf32>, tensor<*xf32>) + // expected-error @+1 {{XlaCallModuleOp doesn't have 1 output.}} + "tf.XlaCallModule"(%input, %cst) <{Sout = [], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> () + func.return +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir index 8f38f889f28e33..a873f30a20cff8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir @@ -105,3 +105,36 @@ func.func @merge_consecutive_qcast(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, % %6 = "quantfork.stats"(%5) {layerStats = dense<[-1.5726943, 4.6875381]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32> func.return %3, %6 : tensor<*xf32>, tensor<*xf32> } + +// ----- + +// CHECK-LABEL: func @skip_nan_inf_constant +// CHECK-SAME: (%[[ARG_0:.*]]: tensor) -> tensor +func.func @skip_nan_inf_constant(%arg0: tensor) -> tensor { + // CHECK: %[[cst0:.*]] = stablehlo.constant + // CHECK: %[[cst1:.*]] = stablehlo.constant + // CHECK: %[[cst2:.*]] = stablehlo.constant + // CHECK: %[[cst3:.*]] = stablehlo.constant + // CHECK-NOT: %[[q0:.*]] = "quantfork.qcast"(%[[cst0]]) + // CHECK-NOT: %[[q1:.*]] = "quantfork.qcast"(%[[cst1]]) + // CHECK: %[[q2:.*]] = "quantfork.qcast"(%[[cst2]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq2:.*]] = "quantfork.dcast"(%[[q2]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[q3:.*]] = "quantfork.qcast"(%[[cst3]]) + // CHECK-SAME: quant.uniform + // CHECK: %[[dq3:.*]] = "quantfork.dcast"(%[[q3]]) + // CHECK-SAME: quant.uniform + %0 = stablehlo.constant dense<0xFF800000> : tensor + %1 = stablehlo.constant dense<0x7FC00000> : tensor + %2 = stablehlo.constant dense<6.000000e+00> : tensor + %3 = stablehlo.constant dense<0.000000e+00> : tensor + %4 = "stablehlo.add"(%0, %1) : (tensor, tensor) -> tensor + %5 = stablehlo.clamp %3, %arg0, %2 : (tensor, tensor, tensor) -> tensor + %6 = "stablehlo.reduce_window"(%5, %4) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %7 : tensor + }) {padding = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor, tensor) -> tensor + return %6 : tensor +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir index d1bfea7a236448..e794dded354da9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir @@ -1,5 +1,8 @@ // RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-quantize 
-verify-each=false | FileCheck %s +// Tests for PopulateFusedGemmStylePatterns are handled in +// quantize_composite_functions for module-level evaluation of functions. + // CHECK-LABEL: quantize_simple_xla_call_module func.func private @quantize_simple_xla_call_module(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { %0 = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> @@ -40,3 +43,27 @@ func.func private @quantize_simple_xla_call_module_no_operand() -> tensor<1x3xf3 // CHECK: %[[XLACALLMODULE_0:.*]] = "tf.XlaCallModule"() <{{{.*}}}> {{{.*}}} : () -> tensor<1x3x!quant.uniform> // CHECK: %[[DCAST_0:.*]] = "quantfork.dcast"(%[[XLACALLMODULE_0]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK: "func.return"(%[[DCAST_0]]) : (tensor<1x3xf32>) -> () + +// ----- + +// Tests for emitting an error when there is no corresponding entry +// function to quantize (@composite_dot_general_fn). + +module attributes {tf_saved_model.semantics} { +// The following pattern does not converge because of a bug in QuantizePass. +// TODO - b/305469508: Fix the QuantizePass to avoid this warning. +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} + func.func private @error_when_no_entry_function(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %0 = stablehlo.constant dense<1.000000e+00> : tensor<2x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 5.000000e-03>> + %2 = "quantfork.dcast"(%1) : (tensor<2x3x!quant.uniform:f32, 5.000000e-03>>) -> tensor<2x3xf32> + %3 = "quantfork.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %4 = "quantfork.dcast"(%3) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> +// expected-error @+2 {{Failed to find a valid entry function}} +// expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} + %5 = "tf.XlaCallModule"(%4, %2) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %6 = "quantfork.qcast"(%5) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %7 = "quantfork.dcast"(%6) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %7 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir index 97ea1f30be81ba..b6efc8cab0060a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir @@ -1,14 +1,17 @@ // RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ // RUN: -stablehlo-quantize-composite-functions | FileCheck %s + +// Tests that basic dot_general is properly quantized. + +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} module attributes {tf_saved_model.semantics} { // The following pattern does not converge because of a bug in QuantizePass. // TODO - b/305469508: Fix the QuantizePass to avoid this warning. 
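Editor's note: the CHECK lines in the rewritten tests below match `!quant.uniform` result types whose scale and zero-point parameters are derived from calibration. As a reading aid only, and not part of this patch, the following sketch shows the affine mapping such a type encodes; the function names and the parameter values are illustrative, not taken from the tests.

def quantize(x: float, scale: float, zero_point: int) -> int:
  # Scale, shift by the zero point, round (the passes emit
  # round-nearest-even), and clamp to the int8 storage range.
  q = round(x / scale) + zero_point
  return max(-128, min(127, q))


def dequantize(q: int, scale: float, zero_point: int) -> float:
  # Inverse mapping back to the expressed (float) type.
  return (q - zero_point) * scale


# Illustrative parameters only.
assert quantize(0.9, scale=0.0035, zero_point=-128) == 127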
-// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} - func.func private @quantize_dot_general(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + func.func private @quantize_dot_general(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> } @@ -16,99 +19,199 @@ module attributes {tf_saved_model.semantics} { // calls the quantized entry function. 
// CHECK-LABEL: func.func private @quantize_dot_general -// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} -// CHECK: %[[CONST_0:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3x3xi8>} : () -> tensor<3x3x!quant.uniform:f32, {{.*}}> -// CHECK: %[[UNIFORM_QUANTIZE_0:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> -// CHECK: %[[CALL_0:.*]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x!quant.uniform>, tensor<3x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK: %[[UNIFORM_QUANTIZE_0:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL_0:.*]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> // CHECK: %[[UNIFORM_DEQUANTIZE_0:.*]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> // CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> - func.func private @composite_dot_general_fn(%arg0: tensor<1x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> return %0 : tensor<1x3xf32> } // Checks that the entry function is quantized for dot_general. Quantized // dot_general outputs an i32 quantized tensor, followed by requantization to // i8 quantized tensor. -// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_2:.*]]: tensor<1x3x!quant.uniform>, %[[ARG_3:.*]]: tensor<3x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} -// CHECK: %[[DOT_GENERAL_0:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]], contracting_dims = [1] x [0] : (tensor<1x3x!quant.uniform>, tensor<3x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_2:.*]]: tensor<1x2x!quant.uniform>, %[[ARG_3:.*]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[DOT_GENERAL_0:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> // CHECK: %[[UNIFORM_QUANTIZE_1:.*]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> // CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> } // ----- -// Tests error when there are no corresponding entry function to quantize -// (@composite_dot_general_fn). +// Tests that fused pattern for dot_general + bias is properly quantized. 
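Editor's note: the comments above describe i32 accumulation followed by requantization to i8, and the next test checks the same flow with a fused bias. As an editorial aside, not part of the patch, here is a rough reference-semantics sketch of that fused computation; the function and parameter names are invented for illustration, and the real scales come from the `quantfork.stats` calibration.

import numpy as np

def quantized_dot_general_with_bias(x_q, x_zp, w_q, bias_q,
                                    x_scale, w_scale, out_scale, out_zp):
  # Accumulate in int32; weights are symmetric (zero point 0), and the bias
  # constant is pre-quantized to int32 with scale = x_scale * w_scale.
  acc = (x_q.astype(np.int32) - x_zp) @ w_q.astype(np.int32) + bias_q
  # Requantize the int32 accumulator to the int8 output type.
  out = np.round(acc * (x_scale * w_scale) / out_scale) + out_zp
  return np.clip(out, -128, 127).astype(np.int8)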
+// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} module attributes {tf_saved_model.semantics} { // The following pattern does not converge because of a bug in QuantizePass. // TODO - b/305469508: Fix the QuantizePass to avoid this warning. -// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} - func.func private @error_when_no_entry_function(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> -// expected-error @+2 {{Failed to find a valid entry function}} -// expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> + func.func private @quantize_dot_general_with_bias(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3xf32>} : () -> tensor<1x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_fn, _original_entry_function = "composite_dot_general_with_bias_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> return %2 : tensor<1x3xf32> } + +// CHECK-LABEL: func.func private @quantize_dot_general_with_bias +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[CALL_0:.*]] = call @quantized_dot_general_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.*]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + +// CHECK: func.func private @quantized_dot_general_with_bias_fn(%[[ARG_2:.*]]: 
tensor<1x2x!quant.uniform>, %[[ARG_3:.*]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_4:.*]]: tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_dot_general_with_bias_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +// CHECK: %[[DOT_GENERAL_0:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[ADD_0:.*]] = stablehlo.add %[[DOT_GENERAL_0]], %[[ARG_4]] : tensor<1x3x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.*]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> + } // ----- -// Tests that XlaCallModule op is not quantized without the quantfork.stats ops. +// Tests that fused pattern for dot_general + bias with dynamic shape is +// not quantized. +// TODO: b/307620428 - Add support for fused bias with dynamic shapes. +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} module attributes {tf_saved_model.semantics} { - func.func private @not_quantized_without_stats(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> - %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> - return %1 : tensor<1x3xf32> +// The following pattern does not converge because of a bug in QuantizePass. +// TODO - b/305469508: Fix the QuantizePass to avoid this warning. 
+ func.func private @quantize_dot_general_with_bias_dynamic(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + // expected-error@+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values, but got}} + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape], _entry_function = @composite_dot_general_with_bias_dynamic_fn, _original_entry_function = "composite_dot_general_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3xf32>, tensor<3xf32>) -> tensor + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor } -// Check that "tf.Const" is converted to stablehlo.constant. XlaCallModule is -// not quantized. -// CHECK-LABEL: func.func private @not_quantized_without_stats -// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<3.000000e-01> : tensor<3x3xf32> -// CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[ARG_1]], %[[CONST_0]]) <{{{.*}}}> {{{.*_entry_function = @composite_dot_general_fn.*}}} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> -// CHECK: return %[[XLA_CALL_MODULE_0]] + func.func private @composite_dot_general_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3xf32>, %arg2: tensor<3xf32>) -> tensor attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x3xf32>) -> tensor + %1 = shape.shape_of %0 : tensor -> tensor<2xindex> + %2 = stablehlo.dynamic_broadcast_in_dim %arg2, %1, dims = [1] : (tensor<3xf32>, tensor<2xindex>) -> tensor + %3 = stablehlo.add %0, %2 : tensor + return %3 : tensor + } +} - func.func private @composite_dot_general_fn(%arg0: tensor<1x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> - return %0 : tensor<1x3xf32> +// ----- + +// Tests that basic convolution is properly quantized. + +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} +module attributes {tf_saved_model.semantics} { +// The following pattern does not converge because of a bug in QuantizePass. +// TODO - b/305469508: Fix the QuantizePass to avoid this warning. 
+ func.func private @quantize_convolution(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64, _entry_function = @composite_convolution_fn, _original_entry_function = "composite_convolution_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> } -// Check that the composite_dot_general_fn is untouched. +// Checks that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. -// CHECK: func.func private @composite_dot_general_fn(%[[ARG_2:.*]]: tensor<1x3xf32>, %[[ARG_3:.*]]: tensor<3x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} -// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]] -// CHECK: return %[[DOT_GENERAL]] +// CHECK-LABEL: func.func private @quantize_convolution +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK: %[[UNIFORM_QUANTIZE_0:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.*]] = call @quantized_convolution_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.*]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_convolution_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } +// Checks that the entry function is quantized for convolution. Quantized +// convolution outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. 
+ +// CHECK: func.func private @quantized_convolution_fn(%[[ARG_2:.*]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_3:.*]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[CONVOLUTION_0:.*]] = stablehlo.convolution(%[[ARG_2]], %[[ARG_3]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.*]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> } // ----- -// Tests that a fusion pattern for dot_general is not yet supported. Further op -// coverage will be provided in the future. -// TODO - b/307620428: Increase op coverage to cover this test case. +// Tests that fused pattern for convolution + bias is properly quantized. +// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} module attributes {tf_saved_model.semantics} { // The following pattern does not converge because of a bug in QuantizePass. // TODO - b/305469508: Fix the QuantizePass to avoid this warning. -// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} - func.func private @dot_general_fn_fusion_not_quantized(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<3x3xf32>} : () -> tensor<3x3xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> -// expected-error @+1 {{'tf.XlaCallModule' op operand #0 must be variadic of tensor of tf.dtype values}} - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> - %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %2 : tensor<1x3xf32> + func.func private @quantize_convolution_with_bias(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3x4x2xf32>} : () -> tensor<1x3x4x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_convolution_with_bias_fn, _original_entry_function = "composite_convolution_with_bias_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> } - 
func.func private @composite_dot_general_fn(%arg0: tensor<1x3xf32>, %arg1: tensor<3x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x3xf32>) -> tensor<1x3xf32> - %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> +// CHECK-LABEL: func.func private @quantize_convolution_with_bias +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.*]] = call @quantized_convolution_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.*]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK: func.func private @quantized_convolution_with_bias_fn(%[[ARG_2:.*]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_3:.*]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_4:.*]]: tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_convolution_with_bias_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3x4x2xf32> + return %1 : tensor<1x3x4x2xf32> + } +// CHECK: %[[CONVOLUTION_0:.*]] = stablehlo.convolution(%[[ARG_2]], %[[ARG_3]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[ADD_0:.*]] = stablehlo.add %[[CONVOLUTION_0]], %[[ARG_4]] : tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.*]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> +} + +// ----- + +// Tests that XlaCallModule op is not quantized without the quantfork.stats ops. 
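Editor's note: the test below relies on the absence of `quantfork.stats` ops, which carry the calibration `layerStats = [min, max]` that the pass turns into scale and zero-point parameters; without them there is nothing to quantize. The sketch that follows is an editorial illustration of that derivation for asymmetric int8, not part of the patch, and TensorFlow's exact nudging and rounding may differ; the function name is invented.

def derive_qparams(stat_min: float, stat_max: float,
                   qmin: int = -128, qmax: int = 127):
  # Include zero in the range so that exact zeros stay representable.
  stat_min, stat_max = min(stat_min, 0.0), max(stat_max, 0.0)
  scale = (stat_max - stat_min) / (qmax - qmin)
  zero_point = int(round(qmin - stat_min / scale))
  return scale, zero_point


# layerStats = [6.0e-6, 0.9], as used by the tests above, gives
# roughly (0.0035, -128).
print(derive_qparams(6.0e-6, 0.9))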
+ +module attributes {tf_saved_model.semantics} { + func.func private @not_quantized_without_stats(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> return %1 : tensor<1x3xf32> } +// Check that "tf.Const" is converted to stablehlo.constant. XlaCallModule is +// not quantized. + +// CHECK-LABEL: func.func private @not_quantized_without_stats +// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> +// CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[ARG_1]], %[[CONST_0]]) <{{{.*}}}> {{{.*_entry_function = @composite_dot_general_fn.*}}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[XLA_CALL_MODULE_0]] + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// Check that the composite_dot_general_fn is untouched. + +// CHECK: func.func private @composite_dot_general_fn(%[[ARG_2:.*]]: tensor<1x2xf32>, %[[ARG_3:.*]]: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[ARG_2]], %[[ARG_3]] +// CHECK: return %[[DOT_GENERAL]] } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_same_scale.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_same_scale.mlir new file mode 100644 index 00000000000000..7878bccf9d7e61 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_same_scale.mlir @@ -0,0 +1,261 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-quantize -verify-each=false | FileCheck %s + +// CHECK-LABEL: same_scale_after_composite +func.func @same_scale_after_composite() -> tensor<3x1xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: () -> tensor<1x3x!quant.uniform> + // CHECK: %[[RESHAPE:.*]] = "stablehlo.reshape"(%[[CALL]]) : (tensor<1x3x!quant.uniform>) -> tensor<3x1x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[RESHAPE]]) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : 
(tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = stablehlo.reshape %2 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + return %5 : tensor<3x1xf32> +} + +// ----- + +// CHECK-LABEL: same_scale_indirectly_connected +func.func @same_scale_indirectly_connected() -> tensor<1x3xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: () -> tensor<1x3x!quant.uniform> + // CHECK: %[[RESHAPE:.*]] = "stablehlo.reshape"(%[[CALL]]) : (tensor<1x3x!quant.uniform>) -> tensor<3x1x!quant.uniform> + // CHECK: %[[TRANSPOSE:.*]] = "stablehlo.transpose"(%[[RESHAPE]]) {permutation = array} : (tensor<3x1x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[TRANSPOSE]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = stablehlo.reshape %2 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + %6 = "stablehlo.transpose"(%5) {permutation = array} : (tensor<3x1xf32>) -> tensor<1x3xf32> + %7 = "quantfork.qcast"(%6) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %8 = "quantfork.dcast"(%7) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %8 : tensor<1x3xf32> +} + +// ----- + +// CHECK-LABEL: same_scale_not_connected_to_composite +func.func @same_scale_not_connected_to_composite() -> tensor<3x1xf32> { + // CHECK: %[[CST:.*]] = stablehlo.constant + // CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[CST]]) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[DQ1:.*]] = "quantfork.dcast"(%[[Q1]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[DQ1]] + // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[RESHAPE]]) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + // CHECK: %[[DQ2:.*]] = "quantfork.dcast"(%[[Q2]]) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + // CHECK: return %[[DQ2]] + + %0 = stablehlo.constant dense<1.000000e+00> : tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = stablehlo.reshape %2 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + return %5 : tensor<3x1xf32> +} + +// ----- + +// 
CHECK-LABEL: concatenate_and_composite +// CHECK: %[[ARG0:.*]]: tensor<3x2xf32> +// CHECK-SAME: %[[ARG1:.*]]: tensor<1x2xf32> +func.func @concatenate_and_composite(%arg0: tensor<3x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<4x5xf32> { + // CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG0]]) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[ARG1]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform:f32, 5.000000e-03>> + // CHECK: %[[PAD:.*]] = "stablehlo.concatenate"(%[[Q1]], %[[Q2]]) {dimension = 0 : i64} + // CHECK-SAME: (tensor<3x2x!quant.uniform:f32, 5.000000e-03>>, tensor<1x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<4x2x!quant.uniform:f32, 5.000000e-03>> + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"(%[[PAD]]) + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: (tensor<4x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<4x5x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[CALL]]) : (tensor<4x5x!quant.uniform>) -> tensor<4x5xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "quantfork.qcast"(%arg0) {volatile} : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + %1 = "quantfork.dcast"(%0) : (tensor<3x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<3x2xf32> + %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform:f32, 5.000000e-03>> + %3 = "quantfork.dcast"(%2) : (tensor<1x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<1x2xf32> + %4 = "stablehlo.concatenate"(%1, %3) { + dimension = 0 : i64 + } : (tensor<3x2xf32>, tensor<1x2xf32>) -> tensor<4x2xf32> + %5 = "quantfork.qcast"(%4) {volatile} : (tensor<4x2xf32>) -> tensor<4x2x!quant.uniform:f32, 5.000000e-03>> + %6 = "quantfork.dcast"(%5) : (tensor<4x2x!quant.uniform:f32, 5.000000e-03>>) -> tensor<4x2xf32> + %7 = "tf.XlaCallModule"(%6) {Sout = [#tf_type.shape<4x5>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<4x2xf32>) -> tensor<4x5xf32> + %8 = "quantfork.qcast"(%7) {volatile} : (tensor<4x5xf32>) -> tensor<4x5x!quant.uniform> + %9 = "quantfork.dcast"(%8) : (tensor<4x5x!quant.uniform>) -> tensor<4x5xf32> + return %9 : tensor<4x5xf32> +} + +// ----- + +// CHECK-LABEL: composite_and_convert +func.func @composite_and_convert() -> tensor<1x3xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: () -> tensor<1x3x!quant.uniform> + // CHECK: %[[CONVERT:.*]] = "stablehlo.convert"(%[[CALL]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[CONVERT]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, 
module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = stablehlo.convert %2 : (tensor<1x3xf32>) -> (tensor<1x3xf32>) + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %5 : tensor<1x3xf32> +} + +// ----- + +// CHECK-LABEL: pad_and_composite +// CHECK: %[[ARG0:.*]]: tensor<2x3xf32> +// CHECK-SAME: %[[ARG1:.*]]: tensor +func.func @pad_and_composite(%arg0: tensor<2x3xf32>, %arg1: tensor) -> tensor<5x6xf32> { + // CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG0]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 5.000000e-03>> + // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[ARG1]]) {volatile} : (tensor) -> tensor:f32, 5.000000e-03>> + // CHECK: %[[PAD:.*]] = "stablehlo.pad"(%[[Q1]], %[[Q2]]) + // CHECK-SAME: {edge_padding_high = array, edge_padding_low = array, interior_padding = array} + // CHECK-SAME: (tensor<2x3x!quant.uniform:f32, 5.000000e-03>>, tensor:f32, 5.000000e-03>>) -> tensor<5x9x!quant.uniform:f32, 5.000000e-03>> + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"(%[[PAD]]) + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: (tensor<5x9x!quant.uniform:f32, 5.000000e-03>>) -> tensor<5x6x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[CALL]]) : (tensor<5x6x!quant.uniform>) -> tensor<5x6xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "quantfork.qcast"(%arg0) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 5.000000e-03>> + %1 = "quantfork.dcast"(%0) : (tensor<2x3x!quant.uniform:f32, 5.000000e-03>>) -> tensor<2x3xf32> + %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor) -> tensor:f32, 5.000000e-03>> + %3 = "quantfork.dcast"(%2) : (tensor:f32, 5.000000e-03>>) -> tensor + %4 = "stablehlo.pad"(%1, %3) { + edge_padding_low = array, + edge_padding_high = array, + interior_padding = array + }: (tensor<2x3xf32>, tensor) -> tensor<5x9xf32> + %5 = "quantfork.qcast"(%4) {volatile} : (tensor<5x9xf32>) -> tensor<5x9x!quant.uniform:f32, 5.000000e-03>> + %6 = "quantfork.dcast"(%5) : (tensor<5x9x!quant.uniform:f32, 5.000000e-03>>) -> tensor<5x9xf32> + %7 = "tf.XlaCallModule"(%6) {Sout = [#tf_type.shape<5x6>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<5x9xf32>) -> tensor<5x6xf32> + %8 = "quantfork.qcast"(%7) {volatile} : (tensor<5x6xf32>) -> tensor<5x6x!quant.uniform> + %9 = "quantfork.dcast"(%8) : (tensor<5x6x!quant.uniform>) -> tensor<5x6xf32> + return %9 : tensor<5x6xf32> +} + +// ----- + +// CHECK-LABEL: composite_and_select +// CHECK: %[[ARG0:.*]]: tensor<1x3xi1> +// CHECK-SAME: %[[ARG1:.*]]: tensor<1x3xf32> +func.func @composite_and_select(%arg0: tensor<1x3xi1>, %arg1: tensor<1x3xf32>) -> tensor<1x3xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // 
CHECK-SAME: () -> tensor<1x3x!quant.uniform> + // CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG1]]) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[SELECT:.*]] = "stablehlo.select"(%[[ARG0]], %[[CALL]], %[[Q1]]) : (tensor<1x3xi1>, tensor<1x3x!quant.uniform>, tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%2) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = "quantfork.qcast"(%arg1) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %4 = "quantfork.dcast"(%3) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %7 = stablehlo.select %arg0, %2, %4 : (tensor<1x3xi1>, tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %8 = "quantfork.qcast"(%7) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %9 = "quantfork.dcast"(%8) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %9 : tensor<1x3xf32> +} + +// ----- + +// CHECK-LABEL: composite_and_broadcast_in_dim +func.func @composite_and_broadcast_in_dim() -> tensor<2x3x2xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: () -> tensor<1x3x!quant.uniform> + // CHECK: %[[BROADCAST:.*]] = "stablehlo.broadcast_in_dim"(%[[CALL]]) + // CHECK-SAME: (tensor<1x3x!quant.uniform>) -> tensor<2x3x2x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[BROADCAST]]) : (tensor<2x3x2x!quant.uniform>) -> tensor<2x3x2xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = "stablehlo.broadcast_in_dim"(%2) { + broadcast_dimensions = dense<[2, 1]>: tensor<2xi64> + } : (tensor<1x3xf32>) -> tensor<2x3x2xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<2x3x2xf32>) -> tensor<2x3x2x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<2x3x2x!quant.uniform>) -> tensor<2x3x2xf32> + return %5 : tensor<2x3x2xf32> +} + +// ----- + +// CHECK-LABEL: composite_and_gather +// CHECK: %[[ARG0:.*]]: tensor<2x3x2xi64> +func.func @composite_and_gather(%arg0: tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // 
CHECK-SAME: () -> tensor<3x4x2x!quant.uniform> + // CHECK: %[[GATHER:.*]] = "stablehlo.gather"(%[[CALL]], %[[ARG0]]) + // CHECK-SAME: (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi64>) -> tensor<2x3x2x2x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[GATHER]]) : (tensor<2x3x2x2x!quant.uniform>) -> tensor<2x3x2x2xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<3x4x2>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<3x4x2xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<3x4x2x!quant.uniform>) -> tensor<3x4x2xf32> + %3 = "stablehlo.gather"(%2, %arg0) { + dimension_numbers = #stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = dense<[1, 2, 2]> : tensor<3xi64>, + indices_are_sorted = false + } : (tensor<3x4x2xf32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<2x3x2x2x!quant.uniform>) -> tensor<2x3x2x2xf32> + return %5 : tensor<2x3x2x2xf32> +} + +// ----- + +// CHECK-LABEL: composite_and_slice +func.func @composite_and_slice() -> tensor<2x2xf32> { + // CHECK: %[[CALL:.*]] = "tf.XlaCallModule"() + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-SAME: () -> tensor<3x4x!quant.uniform> + // CHECK: %[[SLICE:.*]] = "stablehlo.slice"(%[[CALL]]) + // CHECK-SAME: (tensor<3x4x!quant.uniform>) -> tensor<2x2x!quant.uniform> + // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[SLICE]]) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + // CHECK: "func.return"(%[[DQ]]) + + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<3x4>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<3x4xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<3x4x!quant.uniform>) -> tensor<3x4xf32> + %3 = "stablehlo.slice"(%2) { + start_indices = array, + limit_indices = array, + strides = array + } : (tensor<3x4xf32>) -> tensor<2x2xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> + return %5 : tensor<2x2xf32> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir index 3d04c72dec7f7e..745d44282c9e0f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir +++ 
b/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir @@ -1,6 +1,9 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-replace-stablehlo-ops-in-main-function-with-xla-call-module-ops | FileCheck %s +// RUN: stablehlo-quant-opt %s -split-input-file \ +// RUN: -stablehlo-replace-stablehlo-ops-in-main-function-with-xla-call-module-ops \ +// RUN: | FileCheck %s -// Modules with "main" or "serving_default" should properly run this pass and convert subgraphs into XLACallModuleOp. +// Modules with "main" or "serving_default" should properly run this pass and +// convert subgraphs into XLACallModuleOp. module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { @@ -20,23 +23,23 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> %1 = stablehlo.constant dense<1.000000e+03> : tensor<1x3xf32> %2 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %3 = "tf.XlaCallModule"(%2, %0, %1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %3 = "tf.XlaCallModule"(%2, %0, %1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> %4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> %5 = stablehlo.constant dense<1.000000e+03> : tensor<3x64xf32> %6 = stablehlo.constant dense<1.000000e+03> : tensor<1x64xf32> %7 = "tf.CustomAggregator"(%4) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - %8 = "tf.XlaCallModule"(%7, %5, %6) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3xf32>, tensor<3x64xf32>, tensor<1x64xf32>) -> tensor<1x64xf32> + %8 = "tf.XlaCallModule"(%7, %5, %6) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x3xf32>, 
tensor<3x64xf32>, tensor<1x64xf32>) -> tensor<1x64xf32> %9 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x64xf32>) -> tensor<1x64xf32> return %9 : tensor<1x64xf32> } // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}}> {_entry_function = @_stablehlo_main_1 // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable"} // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]]) // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}}> {_entry_function = @_stablehlo_main_0 // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> - // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1" + // CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable"} // CHECK: %[[CUSTOM_AGGREGATOR_3:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_3]] : tensor<1x64xf32> // CHECK: } @@ -63,6 +66,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // ----- +// Tests that the subgraph in serving_default excluding the tf.Identity is +// converted to a single XlaCallModuleOp. 
+ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1654 : i32}, tf_saved_model.semantics} { // CHECK: func private @_stablehlo_main_0 @@ -85,8 +91,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %5 : tensor<1x1024xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"(%arg0) <{Sout = [#tf_type.shape<1x1024>] - // CHECK-SAME: _entry_function = @_stablehlo_main_0 + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"(%arg0) <{Sout = [#tf_type.shape<1x1024>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> // CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP]]) // CHECK: return %[[IDENTITY]] // CHECK } @@ -95,8 +100,10 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // ----- +// Tests that the first stablehlo.constant is converted to XlaCallModuleOp. + module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { - // CHECK: func private @_stablehlo_main_ + // CHECK: func private @_stablehlo_main_0 // CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> // CHECK: return %[[CONSTANT:.*]] // CHECK: } @@ -105,12 +112,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p func.func @serving_default(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> %1 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> %3 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> return %3 : tensor<1x3xf32> } - // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_ + // CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() 
<{Sout = [#tf_type.shape<1024x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}} // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) @@ -127,7 +134,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // ----- -// Tests to confirm that the StableHLO graph is not replaced if "main" or "serving_default" function is in the module. +// Tests to confirm that the StableHLO graph is not replaced if "main" or +// "serving_default" function is not in the module. module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { // CHECK-NOT: func private @_stablehlo_main_ @@ -136,14 +144,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p func.func @random_name(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> %1 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> %3 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> return %3 : tensor<1x3xf32> } // CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> - // CHECK: %[[XLA_CALL_MODULE:.*]] = 
"tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] // CHECK: } @@ -155,3 +163,97 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p return %0 : tensor<1x3xf32> } } + +// ----- + +// Tests where StableHLO graph in main has a small constant to be duplicated. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_1() -> tensor<1024x3xf32> attributes {_from_xla_call_module} + // CHECK: %[[CONSTANT1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: return %[[CONSTANT1:.*]] + // CHECK: } + + // CHECK: func private @_stablehlo_main_0 + // CHECK-SAME: %[[INPUT1:.*]]: tensor<1024x3xf32>, %[[INPUT2:.*]]: tensor<1024x3xf32> + // CHECK: %[[CONSTANT2:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + // CHECK: %[[ADD:.*]] = stablehlo.add %[[INPUT1]], %[[CONSTANT2]] : tensor<1024x3xf32> + // CHECK: %[[MUL:.*]] = stablehlo.multiply %[[INPUT1]], %[[INPUT2]] : tensor<1024x3xf32> + // CHECK: return %[[ADD]], %[[MUL]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output1"]}, tensor<1024x3xf32> {tf_saved_model.index_path = ["output2"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %1 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + %2 = "tf.XlaCallModule"(%1, %0) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> + %3 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> + %4 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32> + %5 = stablehlo.add %3, %4 : tensor<1024x3xf32> + %6 = stablehlo.multiply %3, %0 : tensor<1024x3xf32> + return %5, %6 : tensor<1024x3xf32>, tensor<1024x3xf32> + } + + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: 
%[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_2:.*]]:2 = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_2]]#0, %[[SUBGRAPH_2]]#1 + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +} + +// ----- + +// Tests where StableHLO graph in main has branches. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1629 : i32}, tf_saved_model.semantics} { + // CHECK: func private @_stablehlo_main_1(%[[INPUT:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: %[[CONSTANT1:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<3x3xf32> + // CHECK: %[[ADD:.*]] = stablehlo.add %[[CONSTANT1]], %[[INPUT]] : tensor<3x3xf32> + // CHECK: return %[[ADD:.*]] + // CHECK: } + + // CHECK: func private @_stablehlo_main_0 + // CHECK-SAME: (%[[INPUT1:.*]]: tensor<3x3xf32>, %[[INPUT2:.*]]: tensor<3x3xf32>) + // CHECK-SAME: -> tensor<3x3xf32> + // CHECK: %[[CONSTANT2:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<3x3xf32> + // CHECK: %[[ADD:.*]] = stablehlo.add %[[INPUT1]], %[[INPUT2]] : tensor<3x3xf32> + // CHECK: %[[MUL:.*]] = stablehlo.multiply %[[ADD]], %[[CONSTANT2]] : tensor<3x3xf32> + // CHECK: return %[[MUL]] + // CHECK: } + + // CHECK: @serving_default + func.func @serving_default(%arg0: tensor<3x3xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<3x3xf32> {tf_saved_model.index_path = ["output1"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = stablehlo.constant dense<1.000000e+03> : tensor<3x3xf32> + %1 = stablehlo.add %0, %arg0 : tensor<3x3xf32> + %2 = "tf.CustomAggregator"(%1) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> tensor<3x3xf32> + %3 = "tf.XlaCallModule"(%2, %2) {Sout = [#tf_type.shape<3x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + %4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, 
max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> tensor<3x3xf32> + %5 = stablehlo.add %4, %1 : tensor<3x3xf32> + %6 = stablehlo.multiply %5, %0 : tensor<3x3xf32> + return %6 : tensor<3x3xf32> + } + + // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[INPUT:.*]]) <{Sout = [#tf_type.shape<3x3>], {{.*}}}> {_entry_function = @_stablehlo_main_1 + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]] = "tf.CustomAggregator"(%[[SUBGRAPH_1]]) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[CUSTOM_AGGREGATOR_1]]) <{Sout = [#tf_type.shape<3x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" + // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) + // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<3x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 + // CHECK: return %[[SUBGRAPH_2]] + // CHECK: } + + // CHECK: @composite_dot_general_fn_1 + // CHECK-NOT: tf_quant.composite_function + func.func private @composite_dot_general_fn_1(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + return %0 : tensor<3x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/stablehlo_op_quant_spec_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tests/stablehlo_op_quant_spec_test.cc new file mode 100644 index 00000000000000..8c0c2e5fc06116 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/stablehlo_op_quant_spec_test.cc @@ -0,0 +1,177 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" + +#include +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/platform/test.h" + +namespace mlir::quant::stablehlo { +namespace { + +using ::mlir::quant::common::QuantizationTestBase; + +class IsOpQuantizableStableHloTest : public QuantizationTestBase {}; + +// Quantizable ops: constants +// Non-quantizable ops: normal StableHLO ops and terminators +constexpr absl::string_view module_constant_add = R"mlir( + module { + func.func @constant_add() -> (tensor<3x2xf32>) { + %cst1 = stablehlo.constant dense<2.4> : tensor<3x2xf32> + %cst2 = stablehlo.constant dense<5.7> : tensor<3x2xf32> + %add = stablehlo.add %cst1, %cst2 : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> + func.return %add : tensor<3x2xf32> + } + } +)mlir"; + +// Quantizable ops: XlaCallModule op with "fully_quantizable" attribute and +// same-scale StableHLO ops +// Non-quantizable ops: quantize/dequantize ops +constexpr absl::string_view module_composite_same_scale = R"mlir( + module { + func.func @same_scale_after_composite() -> tensor<3x1xf32> { + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + %1 = "quantfork.qcast"(%0) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> + %2 = "quantfork.dcast"(%1) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + %3 = stablehlo.reshape %2 : (tensor<1x3xf32>) -> tensor<3x1xf32> + %4 = "quantfork.qcast"(%3) {volatile} : (tensor<3x1xf32>) -> tensor<3x1x!quant.uniform> + %5 = "quantfork.dcast"(%4) : (tensor<3x1x!quant.uniform>) -> tensor<3x1xf32> + return %5 : tensor<3x1xf32> + } + } +)mlir"; + +// Non-quantizable ops: XlaCallModule op without "fully_quantizable" attribute +constexpr absl::string_view module_composite_no_attr = R"mlir( + module { + func.func @composite_without_attr() -> tensor<1x3xf32> { + %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @non_quantizable_composite, _original_entry_function = "non_quantizable_composite", _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + } +)mlir"; + +TEST_F(IsOpQuantizableStableHloTest, ConstantOpQuantizable) { + OwningOpRef 
<ModuleOp> module_op_ref = + ParseModuleOpString(module_constant_add); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "constant_add"); + Operation* constant_op = + FindOperationOfType<mlir::stablehlo::ConstantOp>(test_func); + bool is_constant_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(constant_op); + + EXPECT_TRUE(is_constant_quantizable); +} + +TEST_F(IsOpQuantizableStableHloTest, TerminatorOpNotQuantizable) { + OwningOpRef<ModuleOp> module_op_ref = + ParseModuleOpString(module_constant_add); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "constant_add"); + Operation* return_op = FindOperationOfType<func::ReturnOp>(test_func); + bool is_return_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(return_op); + + EXPECT_FALSE(is_return_quantizable); +} + +TEST_F(IsOpQuantizableStableHloTest, SameScaleOpQuantizable) { + OwningOpRef<ModuleOp> module_op_ref = + ParseModuleOpString(module_composite_same_scale); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "same_scale_after_composite"); + Operation* reshape_op = + FindOperationOfType<mlir::stablehlo::ReshapeOp>(test_func); + bool is_reshape_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(reshape_op); + + EXPECT_TRUE(is_reshape_quantizable); +} + +TEST_F(IsOpQuantizableStableHloTest, NonSameScaleOpNotQuantizable) { + OwningOpRef<ModuleOp> module_op_ref = + ParseModuleOpString(module_constant_add); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "constant_add"); + Operation* add_op = FindOperationOfType<mlir::stablehlo::AddOp>(test_func); + bool is_add_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(add_op); + + EXPECT_FALSE(is_add_quantizable); +} + +TEST_F(IsOpQuantizableStableHloTest, ValidXlaCallModuleOpQuantizable) { + OwningOpRef<ModuleOp> module_op_ref = + ParseModuleOpString(module_composite_same_scale); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "same_scale_after_composite"); + Operation* xla_call_module_op = + FindOperationOfType<TF::XlaCallModuleOp>(test_func); + bool is_xla_call_module_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(xla_call_module_op); + + EXPECT_TRUE(is_xla_call_module_quantizable); +} + +TEST_F(IsOpQuantizableStableHloTest, InvalidXlaCallModuleOpNotQuantizable) { + OwningOpRef<ModuleOp> module_op_ref = + ParseModuleOpString(module_composite_no_attr); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "composite_without_attr"); + Operation* xla_call_module_op = + FindOperationOfType<TF::XlaCallModuleOp>(test_func); + bool is_xla_call_module_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(xla_call_module_op); + + EXPECT_FALSE(is_xla_call_module_quantizable); +} + +TEST_F(IsOpQuantizableStableHloTest, QuantizeDequantizeOpNotQuantizable) { + OwningOpRef<ModuleOp> module_op_ref = + ParseModuleOpString(module_composite_same_scale); + func::FuncOp test_func = + GetFunctionFromModule(*module_op_ref, "same_scale_after_composite"); + Operation* quantize_op = + FindOperationOfType<quantfork::QuantizeCastOp>(test_func); + Operation* dequantize_op = + FindOperationOfType<quantfork::DequantizeCastOp>(test_func); + bool is_quantize_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(quantize_op); + bool is_dequantize_quantizable = + mlir::quant::stablehlo::IsOpQuantizableStableHlo(dequantize_op); + + EXPECT_FALSE(is_quantize_quantizable); + EXPECT_FALSE(is_dequantize_quantizable); +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/unwrap_xla_call_module_op.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/unwrap_xla_call_module_op.mlir new file mode 100644 index
00000000000000..dde460411168d0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/unwrap_xla_call_module_op.mlir @@ -0,0 +1,53 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-unwrap-xla-call-module-op | FileCheck %s + +// Tests if XlaCallModule op without quantizable trait that calls function with +// '_from_xla_call_module' trait is unwrapped. +// Tests if XlaCallModule op with quantizable trait is not unwrapped. +// Tests if XlaCallModule op without quantizable trait that calls function +// without '_from_xla_call_module' trait is not unwrapped. + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1682 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: @main_00 + // CHECK: %[[ARG0:.*]]: tensor<10x1x1024xf32> + func.func private @main_00(%arg0: tensor<10x1x1024xf32>) -> tensor<6x5xf32> attributes {tf._original_func_name = "main_0"} { + %0 = "tf.Const"() <{value = dense<1.000000e+00> : tensor<10x1024x3xf32>}> : () -> tensor<10x1024x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %2 = "tf.XlaCallModule"(%1) <{Sout = [#tf_type.shape<3x10>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @main_0, _stablehlo_module_attrs = {}, device = ""} : (tensor<10x1x3xf32>) -> tensor<3x10xf32> + %3 = "tf.XlaCallModule"(%2) <{Sout = [#tf_type.shape<6x5>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @main_1, _stablehlo_module_attrs = {}, device = ""} : (tensor<3x10xf32>) -> tensor<6x5xf32> + return %3 : tensor<6x5xf32> + } + // CHECK: %[[CST:.*]] = "tf.Const"() + // CHECK-NEXT: %[[CALL1:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[CST]]) + // CHECK-SAME: _entry_function = @composite_dot_general_fn_1 + // CHECK-SAME: _tfl_quant_trait = "fully_quantizable" + // CHECK-NOT: "tf.XlaCallModule" + // CHECK-NEXT: %[[RESHAPE:.*]] = stablehlo.reshape %[[CALL1]] : (tensor<10x1x3xf32>) -> tensor<3x10xf32> + // CHECK-NEXT: %[[CALL2:.*]] = "tf.XlaCallModule"(%[[RESHAPE]]) + // CHECK-SAME: _entry_function = @main_1 + // CHECK-NOT: _tfl_quant_trait = "fully_quantizable" + // CHECK-NEXT: return %[[CALL2]] + + // CHECK: @composite_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + return %0 : tensor<10x1x3xf32> + } + // CHECK: %[[DOT:.*]] = stablehlo.dot_general + // CHECK-NEXT: return %[[DOT]] + + // CHECK: @main_0 + func.func private @main_0(%arg0: tensor<10x1x3xf32>) -> tensor<3x10xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.reshape %arg0 : (tensor<10x1x3xf32>) -> tensor<3x10xf32> + return %0 : tensor<3x10xf32> + } + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape 
+ // CHECK-NEXT: return %[[RESHAPE]] + + // CHECK: @main_1 + func.func private @main_1(%arg0: tensor<3x10xf32>) -> tensor<6x5xf32> { + %0 = stablehlo.reshape %arg0 : (tensor<3x10xf32>) -> tensor<6x5xf32> + return %0 : tensor<6x5xf32> + } + // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape + // CHECK-NEXT: return %[[RESHAPE]] +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc index 3afc42e21d1f6e..a55b1a88e5d964 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc @@ -13,14 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/SCF/IR/SCF.h" // from @llvm-project #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "stablehlo/transforms/Passes.h" // from @stablehlo #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" @@ -29,8 +33,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/register.h" +#include "xla/mlir_hlo/mhlo/transforms/passes.h" +#include "tensorflow/core/ir/types/dialect.h" int main(int argc, char** argv) { tensorflow::InitMlir y(&argc, &argv); @@ -39,13 +46,15 @@ int main(int argc, char** argv) { mlir::registerTensorFlowPasses(); mlir::quant::stablehlo::registerPasses(); mlir::quant::stablehlo::registerBridgePasses(); + mlir::stablehlo::registerPasses(); + mlir::mhlo::registerAllMhloPasses(); mlir::DialectRegistry registry; registry.insert(); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc index bfd9de9ca60d25..eecc96b04be9eb 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc @@ -16,20 +16,24 @@ limitations under the License. 
#include +#include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#define DEBUG_TYPE "uniform-quantized-types" + namespace mlir { namespace quant { UniformQuantizedType CreateI8F32UniformQuantizedType(const Location loc, MLIRContext& context, - const float scale, - const int8_t zero_point) { + const double scale, + const int64_t zero_point) { return UniformQuantizedType::getChecked( loc, /*flags=*/QuantizationFlags::Signed, /*storageType=*/IntegerType::get(&context, /*width=*/8), @@ -38,8 +42,8 @@ UniformQuantizedType CreateI8F32UniformQuantizedType(const Location loc, } UniformQuantizedType CreateI32F32UniformQuantizedType( - const Location loc, MLIRContext& context, const float scale, - const int32_t zero_point) { + const Location loc, MLIRContext& context, const double scale, + const int64_t zero_point) { return UniformQuantizedType::getChecked( loc, /*flags=*/QuantizationFlags::Signed, /*storageType=*/IntegerType::get(&context, /*width=*/32), @@ -49,8 +53,8 @@ UniformQuantizedType CreateI32F32UniformQuantizedType( } UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( - const Location loc, MLIRContext& context, const ArrayRef scales, - const ArrayRef zero_points, const int quantization_dimension) { + const Location loc, MLIRContext& context, const ArrayRef scales, + const ArrayRef zero_points, const int quantization_dimension) { return UniformQuantizedPerAxisType::getChecked( loc, /*flags=*/QuantizationFlags::Signed, /*storageType=*/IntegerType::get(&context, /*width=*/8), @@ -60,5 +64,106 @@ UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( /*storageTypeMax=*/llvm::maxIntN(8)); } +bool IsStorageTypeI8(const QuantizedType quantized_type) { + const Type storage_type = quantized_type.getStorageType(); + return storage_type.isInteger(/*width=*/8); +} + +bool IsStorageTypeI32(const QuantizedType quantized_type) { + const Type storage_type = quantized_type.getStorageType(); + return storage_type.isInteger(/*width=*/32); +} + +bool IsExpressedTypeF32(const QuantizedType quantized_type) { + const Type expressed_type = quantized_type.getExpressedType(); + return expressed_type.isa(); +} + +bool IsI8F32UniformQuantizedType(const Type type) { + const UniformQuantizedType quantized_type = + type.dyn_cast_or_null(); + if (!quantized_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI8(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i8 storage type. Got: " + << quantized_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_type << ".\n"); + return false; + } + + return true; +} + +bool IsI8F32UniformQuantizedPerAxisType(const Type type) { + const UniformQuantizedPerAxisType quantized_per_axis_type = + type.dyn_cast_or_null(); + if (!quantized_per_axis_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI8(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i8 storage type. 
Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + return true; +} + +bool IsI32F32UniformQuantizedType(const Type type) { + const UniformQuantizedType quantized_type = + type.dyn_cast_or_null(); + if (!quantized_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI32(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i32 storage type. Got: " + << quantized_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_type << ".\n"); + return false; + } + + return true; +} + +// Determines whether the storage type of a quantized type is supported by +// `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. +bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type) { + if (storage_type.getWidth() == 8 || + (storage_type.isSigned() && storage_type.getWidth() == 16)) { + return true; + } + LLVM_DEBUG(llvm::dbgs() + << "Uniform quantize / dequantize op only supports ui8, i8 or " + "i16 for the storage type of uniform quantized type. Got: " + << storage_type << ".\n"); + return false; +} + } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h index 68774b2ecb876b..d04dc5a5761b8f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h @@ -18,8 +18,10 @@ limitations under the License. #include #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { @@ -32,8 +34,8 @@ namespace quant { // values can be non-zero values. UniformQuantizedType CreateI8F32UniformQuantizedType(Location loc, MLIRContext& context, - float scale, - int8_t zero_point); + double scale, + int64_t zero_point); // Creates a `UniformQuantizedType` with the given `scale` and `zero_point` // values. The produced type has f32 as its expressed type and i32 as its @@ -42,8 +44,8 @@ UniformQuantizedType CreateI8F32UniformQuantizedType(Location loc, // non-zero values. UniformQuantizedType CreateI32F32UniformQuantizedType(Location loc, MLIRContext& context, - float scale, - int32_t zero_point); + double scale, + int64_t zero_point); // Creates a `UniformQuantizedPerAxisType` with the given `scales` and // `zero_points` values. The produced type has f32 as its expressed type and @@ -51,8 +53,30 @@ UniformQuantizedType CreateI32F32UniformQuantizedType(Location loc, // storage value, i.e. [-128, 127]. Assumes asymmetric quantization, meaning the // zero point values can be non-zero values. 
UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( - Location loc, MLIRContext& context, ArrayRef scales, - ArrayRef zero_points, int quantization_dimension); + Location loc, MLIRContext& context, ArrayRef scales, + ArrayRef zero_points, int quantization_dimension); + +bool IsStorageTypeI8(QuantizedType quantized_type); + +bool IsStorageTypeI32(QuantizedType quantized_type); + +bool IsExpressedTypeF32(QuantizedType quantized_type); + +// Returns true iff `type` is a uniform quantized type whose storage type is +// 8-bit integer and expressed type is f32. +bool IsI8F32UniformQuantizedType(Type type); + +// Returns true iff `type` is a uniform quantized per-axis (per-channel) type +// whose storage type is 8-bit integer and expressed type is f32. +bool IsI8F32UniformQuantizedPerAxisType(Type type); + +// Returns true iff `type` is a uniform quantized type whose storage type is +// 32-bit integer and expressed type is f32. +bool IsI32F32UniformQuantizedType(Type type); + +// Determines whether the storage type of a quantized type is supported by +// `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. +bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type); } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc index 0888bfa8d22908..f33b322cfbd9e4 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -30,8 +31,10 @@ namespace quant { namespace { using ::testing::ElementsAreArray; +using ::testing::NotNull; +using ::testing::Test; -class CreateI8F32UniformQuantizedTypeTest : public ::testing::Test { +class CreateI8F32UniformQuantizedTypeTest : public Test { protected: CreateI8F32UniformQuantizedTypeTest() : ctx_() { ctx_.loadDialect(); @@ -40,7 +43,7 @@ class CreateI8F32UniformQuantizedTypeTest : public ::testing::Test { MLIRContext ctx_; }; -TEST_F(CreateI8F32UniformQuantizedTypeTest, HasI8StorageType) { +TEST_F(CreateI8F32UniformQuantizedTypeTest, I8StorageTypeSucceeds) { const UniformQuantizedType quantized_type = CreateI8F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -48,7 +51,7 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, HasI8StorageType) { EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(8)); } -TEST_F(CreateI8F32UniformQuantizedTypeTest, HasF32ExpressedType) { +TEST_F(CreateI8F32UniformQuantizedTypeTest, F32ExpressedTypeSucceeds) { const UniformQuantizedType quantized_type = CreateI8F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -56,7 +59,7 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, HasF32ExpressedType) { EXPECT_TRUE(quantized_type.getExpressedType().isF32()); } -TEST_F(CreateI8F32UniformQuantizedTypeTest, IsSigned) { +TEST_F(CreateI8F32UniformQuantizedTypeTest, SignedQuantizedTypeSucceeds) { const UniformQuantizedType quantized_type = 
CreateI8F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -64,7 +67,7 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, IsSigned) { EXPECT_TRUE(quantized_type.isSigned()); } -TEST_F(CreateI8F32UniformQuantizedTypeTest, SotrageTypeMinMaxEqualToI8MinMax) { +TEST_F(CreateI8F32UniformQuantizedTypeTest, StorageTypeMinMaxEqualToI8MinMax) { const UniformQuantizedType quantized_type = CreateI8F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -82,7 +85,7 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, HasScaleAndZeroPointProperlySet) { EXPECT_EQ(quantized_type.getZeroPoint(), 99); } -class CreateI32F32UniformQuantizedTypeTest : public ::testing::Test { +class CreateI32F32UniformQuantizedTypeTest : public Test { protected: CreateI32F32UniformQuantizedTypeTest() : ctx_() { ctx_.loadDialect(); @@ -91,7 +94,7 @@ class CreateI32F32UniformQuantizedTypeTest : public ::testing::Test { MLIRContext ctx_; }; -TEST_F(CreateI32F32UniformQuantizedTypeTest, HasI32StorageType) { +TEST_F(CreateI32F32UniformQuantizedTypeTest, I32StorageTypeSucceeds) { const UniformQuantizedType quantized_type = CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -99,7 +102,7 @@ TEST_F(CreateI32F32UniformQuantizedTypeTest, HasI32StorageType) { EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(32)); } -TEST_F(CreateI32F32UniformQuantizedTypeTest, HasF32ExpressedType) { +TEST_F(CreateI32F32UniformQuantizedTypeTest, F32ExpressedTypeSucceeds) { const UniformQuantizedType quantized_type = CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -107,7 +110,7 @@ TEST_F(CreateI32F32UniformQuantizedTypeTest, HasF32ExpressedType) { EXPECT_TRUE(quantized_type.getExpressedType().isF32()); } -TEST_F(CreateI32F32UniformQuantizedTypeTest, IsSigned) { +TEST_F(CreateI32F32UniformQuantizedTypeTest, SignedQuantizedTypeSucceeds) { const UniformQuantizedType quantized_type = CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -116,7 +119,7 @@ TEST_F(CreateI32F32UniformQuantizedTypeTest, IsSigned) { } TEST_F(CreateI32F32UniformQuantizedTypeTest, - SotrageTypeMinMaxEqualToI32MinMax) { + StorageTypeMinMaxEqualToI32MinMax) { const UniformQuantizedType quantized_type = CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); @@ -136,7 +139,7 @@ TEST_F(CreateI32F32UniformQuantizedTypeTest, HasScaleAndZeroPointProperlySet) { EXPECT_EQ(quantized_type.getZeroPoint(), 1111); } -class CreateI8F32UniformQuantizedPerAxisTypeTest : public ::testing::Test { +class CreateI8F32UniformQuantizedPerAxisTypeTest : public Test { protected: CreateI8F32UniformQuantizedPerAxisTypeTest() : ctx_() { ctx_.loadDialect(); @@ -145,34 +148,35 @@ class CreateI8F32UniformQuantizedPerAxisTypeTest : public ::testing::Test { MLIRContext ctx_; }; -TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, HasI8StorageType) { +TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, I8StorageTypeSucceeds) { const UniformQuantizedPerAxisType quantized_type = CreateI8F32UniformQuantizedPerAxisType( UnknownLoc::get(&ctx_), ctx_, - /*scales=*/SmallVector{1.0, 1.0}, - /*zero_points=*/SmallVector{0, 0}, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, /*quantization_dimension=*/0); EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(8)); } -TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, HasF32ExpressedType) { 
+TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, F32ExpressedTypeSucceeds) { const UniformQuantizedPerAxisType quantized_type = CreateI8F32UniformQuantizedPerAxisType( UnknownLoc::get(&ctx_), ctx_, - /*scales=*/SmallVector{1.0, 1.0}, - /*zero_points=*/SmallVector{0, 0}, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, /*quantization_dimension=*/0); EXPECT_TRUE(quantized_type.getExpressedType().isF32()); } -TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, IsSigned) { +TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, + SignedQuantizedTypeSucceeds) { const UniformQuantizedPerAxisType quantized_type = CreateI8F32UniformQuantizedPerAxisType( UnknownLoc::get(&ctx_), ctx_, - /*scales=*/SmallVector{1.0, 1.0}, - /*zero_points=*/SmallVector{0, 0}, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, /*quantization_dimension=*/0); EXPECT_TRUE(quantized_type.isSigned()); @@ -183,8 +187,8 @@ TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, const UniformQuantizedPerAxisType quantized_type = CreateI8F32UniformQuantizedPerAxisType( UnknownLoc::get(&ctx_), ctx_, - /*scales=*/SmallVector{1.0, 1.0}, - /*zero_points=*/SmallVector{0, 0}, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, /*quantization_dimension=*/0); EXPECT_EQ(quantized_type.getStorageTypeMin(), -128); @@ -196,8 +200,8 @@ TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, const UniformQuantizedPerAxisType quantized_type = CreateI8F32UniformQuantizedPerAxisType( UnknownLoc::get(&ctx_), ctx_, - /*scales=*/SmallVector{1.0, 1.0}, - /*zero_points=*/SmallVector{0, 0}, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, /*quantization_dimension=*/3); EXPECT_EQ(quantized_type.getQuantizedDimension(), 3); @@ -208,14 +212,182 @@ TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, const UniformQuantizedPerAxisType quantized_type = CreateI8F32UniformQuantizedPerAxisType( UnknownLoc::get(&ctx_), ctx_, - /*scales=*/SmallVector{8.0, 9.0}, - /*zero_points=*/SmallVector{98, 99}, + /*scales=*/SmallVector{8.0, 9.0}, + /*zero_points=*/SmallVector{98, 99}, /*quantization_dimension=*/0); EXPECT_THAT(quantized_type.getScales(), ElementsAreArray({8.0, 9.0})); EXPECT_THAT(quantized_type.getZeroPoints(), ElementsAreArray({98, 99})); } +class IsI8F32UniformQuantizedTypeTest : public Test { + protected: + IsI8F32UniformQuantizedTypeTest() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; + OpBuilder builder_{&ctx_}; +}; + +TEST_F(IsI8F32UniformQuantizedTypeTest, I8F32UniformQuantizedTypeSucceeds) { + const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsI8F32UniformQuantizedType(qi8_type)); +} + +TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedTypeSucceeds) { + const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_THAT(qi8_type.dyn_cast_or_null(), NotNull()); +} + +TEST_F(IsI8F32UniformQuantizedTypeTest, StorageTypeI8Succeeds) { + const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsStorageTypeI8(qi8_type)); +} + +TEST_F(IsI8F32UniformQuantizedTypeTest, 
ExpressedTypeF32Succeeds) { + const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsExpressedTypeF32(qi8_type)); +} + +class IsI8F32UniformQuantizedPerAxisTypeTest : public Test { + protected: + IsI8F32UniformQuantizedPerAxisTypeTest() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; + OpBuilder builder_{&ctx_}; +}; + +TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, + I8F32UniformQuantizedPerAxisTypeSucceeds) { + const UniformQuantizedPerAxisType qi8_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, + /*storageTypeMax=*/255); + EXPECT_TRUE(IsI8F32UniformQuantizedPerAxisType(qi8_per_axis_type)); + EXPECT_FALSE(IsI8F32UniformQuantizedType(qi8_per_axis_type)); +} + +TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedPerAxisTypeSucceeds) { + const UniformQuantizedPerAxisType qi8_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, + /*storageTypeMax=*/255); + EXPECT_THAT(qi8_per_axis_type.dyn_cast_or_null(), + NotNull()); +} + +TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, StorageTypeI8Succeeds) { + const UniformQuantizedPerAxisType qi8_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, + /*storageTypeMax=*/255); + EXPECT_TRUE(IsStorageTypeI8(qi8_per_axis_type)); +} + +TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, ExpressedTypeF32Succeeds) { + const UniformQuantizedPerAxisType qi8_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, + /*storageTypeMax=*/255); + EXPECT_TRUE(IsExpressedTypeF32(qi8_per_axis_type)); +} + +class IsI32F32UniformQuantizedTypeTest : public Test { + protected: + IsI32F32UniformQuantizedTypeTest() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; + OpBuilder builder_{&ctx_}; +}; + +TEST_F(IsI32F32UniformQuantizedTypeTest, I32F32UniformQuantizedTypeSucceeds) { + const UniformQuantizedType qi32_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI32Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsI32F32UniformQuantizedType(qi32_type)); +} + +TEST_F(IsI32F32UniformQuantizedTypeTest, UniformQuantizedTypeSucceeds) { + const UniformQuantizedType qi32_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_THAT(qi32_type.dyn_cast_or_null(), NotNull()); +} + +TEST_F(IsI32F32UniformQuantizedTypeTest, StorageTypeI32Succeeds) { + const UniformQuantizedType qi32_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI32Type(), builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsStorageTypeI32(qi32_type)); +} + +TEST_F(IsI32F32UniformQuantizedTypeTest, ExpressedTypeF32Succeeds) { + 
const UniformQuantizedType qi32_per_axis_type = + quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsExpressedTypeF32(qi32_per_axis_type)); +} + +class IsSupportedByTfliteQuantizeOrDequantizeOpsTest : public Test { + protected: + IsSupportedByTfliteQuantizeOrDequantizeOpsTest() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; + OpBuilder builder_{&ctx_}; +}; + +TEST_F(IsSupportedByTfliteQuantizeOrDequantizeOpsTest, StorageTypeI8Succeeds) { + auto qi8_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getIntegerType(8, /*isSigned=*/true), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsSupportedByTfliteQuantizeOrDequantizeOps( + dyn_cast_or_null(qi8_type.getStorageType()))); +} + +TEST_F(IsSupportedByTfliteQuantizeOrDequantizeOpsTest, StorageTypeI16Succeeds) { + auto qi16_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getIntegerType(16, /*isSigned=*/true), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsSupportedByTfliteQuantizeOrDequantizeOps( + dyn_cast_or_null(qi16_type.getStorageType()))); +} + +TEST_F(IsSupportedByTfliteQuantizeOrDequantizeOpsTest, StorageTypeUI8Succeeds) { + auto qi8_type = quant::UniformQuantizedType::get( + /*flags=*/0, builder_.getIntegerType(8, /*isSigned=*/false), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + EXPECT_TRUE(IsSupportedByTfliteQuantizeOrDequantizeOps( + dyn_cast_or_null(qi8_type.getStorageType()))); +} + } // namespace } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc index 4dcdb637e1b430..a864ee556ff5af 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc @@ -24,14 +24,23 @@ limitations under the License. 
namespace mlir::quant::stablehlo { namespace { -TEST(UtilsTest, IsStablehloOp) { - MLIRContext ctx; - OpBuilder b(&ctx); - ctx.loadDialect(); +using ::testing::Test; +class StablehloTypeUtilsTest : public Test { + protected: + StablehloTypeUtilsTest() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; + OpBuilder builder_{&ctx_}; +}; + +TEST_F(StablehloTypeUtilsTest, ValidStablehloOpSucceeds) { mlir::stablehlo::ConstantOp constant_op = - b.create(b.getUnknownLoc(), - b.getI32IntegerAttr(0)); + builder_.create( + builder_.getUnknownLoc(), builder_.getI32IntegerAttr(0)); EXPECT_TRUE(IsStablehloOp(constant_op)); constant_op->erase(); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils_test.cc index 03495d3ddae7aa..87d71438cf4e7c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils_test.cc @@ -90,7 +90,7 @@ std::unique_ptr CreateContext() { return context; } -TEST(GetDenseAttrFromTensorProtoAttrTest, Qint8ToUQ8) { +TEST(GetDenseAttrFromTensorProtoAttrTest, Qint8ToUQ8Succeeds) { auto context = CreateContext(); TensorType result_tensor_type = RankedTensorType::get( {2, 2}, quant::UniformQuantizedType::get( @@ -109,7 +109,7 @@ TEST(GetDenseAttrFromTensorProtoAttrTest, Qint8ToUQ8) { EXPECT_EQ(dense_attr->getValues()[3], 4); } -TEST(GetDenseAttrFromTensorProtoAttrTest, Qint8ToInt8) { +TEST(GetDenseAttrFromTensorProtoAttrTest, Qint8ToInt8Succeeds) { auto context = CreateContext(); TensorType result_tensor_type = RankedTensorType::get({2, 2}, IntegerType::get(context.get(), 8)); @@ -125,7 +125,7 @@ TEST(GetDenseAttrFromTensorProtoAttrTest, Qint8ToInt8) { EXPECT_EQ(dense_attr->getValues()[3], 4); } -TEST(GetDenseAttrFromTensorProtoAttrTest, Qint32ToUQ32) { +TEST(GetDenseAttrFromTensorProtoAttrTest, Qint32ToUQ32Succeeds) { auto context = CreateContext(); TensorType result_tensor_type = RankedTensorType::get( {2, 2}, @@ -145,7 +145,7 @@ TEST(GetDenseAttrFromTensorProtoAttrTest, Qint32ToUQ32) { EXPECT_EQ(dense_attr->getValues()[3], 4); } -TEST(GetDenseAttrFromTensorProtoAttrTest, Qint32ToInt32) { +TEST(GetDenseAttrFromTensorProtoAttrTest, Qint32ToInt32Succeeds) { auto context = CreateContext(); TensorType result_tensor_type = RankedTensorType::get({2, 2}, IntegerType::get(context.get(), 32)); @@ -161,7 +161,7 @@ TEST(GetDenseAttrFromTensorProtoAttrTest, Qint32ToInt32) { EXPECT_EQ(dense_attr->getValues()[3], 4); } -TEST(GetDenseAttrFromTensorProtoAttrTest, UnsupportedQint16) { +TEST(GetDenseAttrFromTensorProtoAttrTest, UnsupportedQint16Fails) { auto context = CreateContext(); TensorType result_tensor_type = RankedTensorType::get({2, 2}, IntegerType::get(context.get(), 16)); @@ -170,7 +170,7 @@ TEST(GetDenseAttrFromTensorProtoAttrTest, UnsupportedQint16) { GetDenseAttrFromTensorProtoAttr(GetQint16Tensor(), result_tensor_type))); } -TEST(IsTFQintTypeTest, IsTFQintType) { +TEST(IsTFQintTypeTest, ValidTFQintTypeSucceeds) { auto context = CreateContext(); EXPECT_TRUE(IsTFQintType(TF::Qint8Type::get(context.get()))); @@ -183,7 +183,7 @@ TEST(IsTFQintTypeTest, IsTFQintType) { EXPECT_FALSE(IsTFQintType(TF::Float8E5M2RefType::get(context.get()))); } -TEST(GetIntTypeFromTFQintTest, GetIntTypeFromTFQint) { +TEST(GetIntTypeFromTFQintTest, ChecksIntTypesFromTFQint) { auto context = CreateContext(); auto type = GetIntTypeFromTFQint(TF::Qint8Type::get(context.get())); diff --git 
a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index f5c170122977ba..c973a4fed16bb0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -51,24 +51,6 @@ genrule( tools = ["gen_quantized_function_library"], ) -cc_library( - name = "pass_utils", - srcs = [ - "passes/utils.cc", - ], - hdrs = [ - "passes/utils.h", - ], - compatible_with = get_compatible_with_portable(), - deps = [ - ":quantization_options_proto_cc", - "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", - "//tensorflow/compiler/mlir/tensorflow", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - ], -) - cc_library( name = "manipulate_model_attr", srcs = [ @@ -117,12 +99,11 @@ td_library( "passes/quantize_composite_functions.td", "passes/replace_cast_hacks_with_tf_xla_ops.td", "passes/tf_quant_ops.td", - "passes/utils.td", ], compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops_td_files", - "//tensorflow/compiler/mlir/quantization/tensorflow/utils:lift_as_function_call_utils_td_files", + "//tensorflow/compiler/mlir/quantization/common:quant_td_files", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", "@llvm-project//mlir:ArithOpsTdFiles", "@llvm-project//mlir:FuncTdFiles", @@ -411,7 +392,6 @@ cc_library( ":lift_quantizable_spots_as_functions_inc_gen", ":manipulate_model_attr", ":optimize_inc_gen", - ":pass_utils", ":post_quantize_inc_gen", ":prepare_lifting_inc_gen", ":prepare_quantize_inc_gen", @@ -425,6 +405,8 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:const_op_size", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:constant_fold", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc", @@ -432,7 +414,6 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_op_quant_spec", "//tensorflow/compiler/mlir/quantization/tensorflow/ops:tf_quantize_op", "//tensorflow/compiler/mlir/quantization/tensorflow/utils:fake_quant_utils", - "//tensorflow/compiler/mlir/quantization/tensorflow/utils:lift_as_function_call_utils", "//tensorflow/compiler/mlir/quantization/tensorflow/utils:tf_to_uniform_attribute_utils", "//tensorflow/compiler/mlir/quantization/tensorflow/utils:tf_to_xla_attribute_utils", "//tensorflow/compiler/mlir/tensorflow", @@ -543,6 +524,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ReconcileUnrealizedCasts", "@llvm-project//mlir:Transforms", "@local_xla//xla/mlir_hlo:mhlo_passes", ], @@ -616,5 +598,6 @@ tf_cc_binary( "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:ShapeDialect", + "@stablehlo//:stablehlo_ops", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD index aec612c95b7b62..34260f6e75e1c4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD @@ -12,6 
+12,7 @@ load( "get_compatible_with_portable", "tf_kernel_library", "tf_py_strict_test", + "tf_python_pybind_extension", ) load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") load( @@ -35,7 +36,6 @@ cc_library( srcs = ["calibrator_singleton.cc"], hdrs = ["calibrator_singleton.h"], compatible_with = get_compatible_with_portable(), - visibility = ["//visibility:private"], deps = [ ":calibration_statistics_collector_average_min_max", ":calibration_statistics_collector_base", @@ -223,9 +223,9 @@ tf_py_strict_test( deps = [ ":calibration_statistics_proto_py", ":gen_custom_aggregator_op_wrapper", + ":pywrap_calibration", "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", - "//tensorflow/compiler/mlir/quantization/tensorflow/python:pywrap_quantize_model", "//tensorflow/python:pywrap_tensorflow", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", @@ -233,3 +233,17 @@ tf_py_strict_test( "//tensorflow/python/platform:client_testlib", ], ) + +tf_python_pybind_extension( + name = "pywrap_calibration", + srcs = ["pywrap_calibration.cc"], + pytype_srcs = ["pywrap_calibration.pyi"], + deps = [ + ":calibration_statistics_proto_cc", + ":calibrator_singleton", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@pybind11", + "@pybind11_protobuf//pybind11_protobuf:native_proto_caster", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc index d3b0475a3dbc74..95e89a7c573c91 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include #include #include #include @@ -107,6 +108,11 @@ std::optional CalibratorSingleton::GetStatistics( return instance.id_to_collector_[id_str]->GetStatistics(); } +int64_t CalibratorSingleton::IssueNewId() { + CalibratorSingleton& instance = GetInstance(); + return instance.next_id_++; +} + void CalibratorSingleton::AssignIfNotExists( std::string id_str, const CalibrationOptions& calib_opts) { CalibratorSingleton& instance = GetInstance(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h index 138352bfcf3d53..38432b01a5a3da 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATOR_SINGLETON_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATOR_SINGLETON_H_ +#include +#include #include #include #include @@ -35,6 +37,7 @@ namespace calibrator { using tensorflow::quantization::CalibrationOptions; +// TODO: b/315084876 - Move to stablehlo quantizer directory. class CalibratorSingleton { public: // Clears the collected information. 
@@ -65,12 +68,20 @@ class CalibratorSingleton { static std::optional GetStatistics( absl::string_view id); + // Issues a new node ID that uniquely identifies a set of calibration + // statistics. + static int64_t IssueNewId(); + private: static CalibratorSingleton& GetInstance(); static absl::Mutex lock_; static void AssignIfNotExists(std::string id_str, const CalibrationOptions& calib_opts); + // Indicates the next id for a set of calibration statistics. For every new ID + // issued this will be incremented atomically. + std::atomic next_id_{0}; + absl::flat_hash_map> id_to_collector_; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc index d6e85c33da8c76..d58dbb838be792 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include #include #include @@ -201,6 +202,12 @@ TEST(CalibratorSingletonTest, SimpleAverageMinMax) { EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 3); } +TEST(CalibratorSingletonTest, IssueNewIdGeneratesNewId) { + const int64_t id = CalibratorSingleton::IssueNewId(); + const int64_t next_id = CalibratorSingleton::IssueNewId(); + EXPECT_NE(id, next_id); +} + } // namespace } // namespace calibrator } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py index a9d1ccacbb7533..5818017a155b58 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py @@ -16,16 +16,15 @@ import tensorflow # pylint: disable=unused-import -# pylint: disable=invalid-import-order,g-bad-import-order -from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import +from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 as quant_opts_pb2 +from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 as calib_stat_pb2 from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import custom_aggregator_op_wrapper -from tensorflow.compiler.mlir.quantization.tensorflow.python import pywrap_quantize_model as quantize_model_wrapper +from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import pywrap_calibration +from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 as quant_opts_pb2 -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 as calib_stat_pb2 _CalibrationMethod = quant_opts_pb2.CalibrationOptions.CalibrationMethod @@ -37,8 +36,8 @@ def setUp(self): 
ops.disable_eager_execution() def testBypassAndMinMax(self): - with self.test_session(): - quantize_model_wrapper.clear_calibrator() + with self.session(): + pywrap_calibration.clear_calibrator() input_tensor = array_ops.constant( [1.0, 2.0, 3.0, 4.0, 5.0], dtypes.float32 ) @@ -51,7 +50,7 @@ def testBypassAndMinMax(self): self.assertAllEqual(self.evaluate(aggregator), [1.0, 2.0, 3.0, 4.0, 5.0]) statistics: calib_stat_pb2.CalibrationStatistics = ( - quantize_model_wrapper.get_statistics_from_calibrator('1') + pywrap_calibration.get_statistics_from_calibrator('1') ) min_val = statistics.min_max_statistics.global_min @@ -60,8 +59,8 @@ def testBypassAndMinMax(self): self.assertAllEqual((min_val, max_val), (1.0, 5.0)) def testTwoIdentities(self): - with self.test_session(): - quantize_model_wrapper.clear_calibrator() + with self.session(): + pywrap_calibration.clear_calibrator() input_tensor1 = array_ops.constant( [1.0, 2.0, 3.0, 4.0, 5.0], dtypes.float32 ) @@ -84,21 +83,21 @@ def testTwoIdentities(self): ) statistics: calib_stat_pb2 = ( - quantize_model_wrapper.get_statistics_from_calibrator('2') + pywrap_calibration.get_statistics_from_calibrator('2') ) min_val = statistics.min_max_statistics.global_min max_val = statistics.min_max_statistics.global_max self.assertAllEqual((min_val, max_val), (1.0, 5.0)) statistics: calib_stat_pb2 = ( - quantize_model_wrapper.get_statistics_from_calibrator('3') + pywrap_calibration.get_statistics_from_calibrator('3') ) min_val = statistics.min_max_statistics.global_min max_val = statistics.min_max_statistics.global_max self.assertAllEqual((min_val, max_val), (-5.0, -1.0)) def testClearData(self): - with self.test_session(): - quantize_model_wrapper.clear_calibrator() + with self.session(): + pywrap_calibration.clear_calibrator() input_tensor1 = array_ops.constant( [1.0, 2.0, 3.0, 4.0, 5.0], dtypes.float32 ) @@ -121,33 +120,33 @@ def testClearData(self): ) statistics: calib_stat_pb2 = ( - quantize_model_wrapper.get_statistics_from_calibrator('4') + pywrap_calibration.get_statistics_from_calibrator('4') ) min_val = statistics.min_max_statistics.global_min max_val = statistics.min_max_statistics.global_max self.assertAllEqual((min_val, max_val), (1.0, 5.0)) statistics: calib_stat_pb2 = ( - quantize_model_wrapper.get_statistics_from_calibrator('5') + pywrap_calibration.get_statistics_from_calibrator('5') ) min_val = statistics.min_max_statistics.global_min max_val = statistics.min_max_statistics.global_max self.assertAllEqual((min_val, max_val), (-5.0, -1.0)) - quantize_model_wrapper.clear_data_from_calibrator('4') + pywrap_calibration.clear_data_from_calibrator('4') with self.assertRaises(ValueError): - quantize_model_wrapper.get_statistics_from_calibrator('4') + pywrap_calibration.get_statistics_from_calibrator('4') statistics: calib_stat_pb2 = ( - quantize_model_wrapper.get_statistics_from_calibrator('5') + pywrap_calibration.get_statistics_from_calibrator('5') ) min_val = statistics.min_max_statistics.global_min max_val = statistics.min_max_statistics.global_max self.assertAllEqual((min_val, max_val), (-5.0, -1.0)) def testBypassAndAverageMinMax(self): - with self.test_session(): - quantize_model_wrapper.clear_calibrator() + with self.session(): + pywrap_calibration.clear_calibrator() input_tensor1 = array_ops.constant( [-50.0, -25.0, 0.0, 25.0, 50.0], dtypes.float32 ) @@ -173,7 +172,7 @@ def testBypassAndAverageMinMax(self): ) statistics: calib_stat_pb2 = ( - quantize_model_wrapper.get_statistics_from_calibrator('6') + 
pywrap_calibration.get_statistics_from_calibrator('6') ) min_sum = statistics.average_min_max_statistics.min_sum diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.cc new file mode 100644 index 00000000000000..8f7c4e30457a2e --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.cc @@ -0,0 +1,91 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11_protobuf/native_proto_caster.h" // from @pybind11_protobuf +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" + +namespace py = ::pybind11; + +namespace { + +using ::tensorflow::calibrator::CalibrationStatistics; +using ::tensorflow::calibrator::CalibratorSingleton; + +// Retrieves collected statistics of a `CustomAggregator` node from the +// singleton. `id` is the identifier of the `CustomAggregator`. +CalibrationStatistics GetStatisticsFromCalibrator(const absl::string_view id) { + std::optional statistics = + CalibratorSingleton::GetStatistics(id); + + if (!statistics.has_value()) { + throw py::value_error(absl::StrFormat( + "Calibrated data does not exist. Cannot find statistics." + "value for id: '%s'", + id)); + } + + return *statistics; +} + +} // namespace + +PYBIND11_MODULE(pywrap_calibration, m) { + // Allows type casting protobuf objects. + pybind11_protobuf::ImportNativeProtoCasters(); + + m.doc() = "Defines functions for interacting with CalibratorSingleton."; + + m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. + // LINT.IfChange + "clear_calibrator", + []() -> void + // LINT.ThenChange(pywrap_calibration.pyi:clear_calibrator) + { CalibratorSingleton::ClearCollectedInformation(); }, + R"pbdoc( + Clears the collected metrics from the calibrator. + )pbdoc"); + m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. + // LINT.IfChange + "clear_data_from_calibrator", + [](const absl::string_view id) -> void + // LINT.ThenChange(pywrap_calibration.pyi:clear_data_from_calibrator) + { CalibratorSingleton::ClearData(id); }, + R"pbdoc( + Clears the collected data of the given id from calibrator. + )pbdoc", + py::arg("id")); + m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. 
+ // LINT.IfChange + "get_statistics_from_calibrator", + [](const absl::string_view id) -> CalibrationStatistics { + // LINT.ThenChange(pywrap_calibration.pyi:get_statistics_from_calibrator) + return GetStatisticsFromCalibrator(id); + }, + R"pbdoc( + Returns the proto CalibrationStatistics given id from calibrator. + )pbdoc", + py::arg("id")); +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.pyi b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.pyi new file mode 100644 index 00000000000000..5d859fee947364 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.pyi @@ -0,0 +1,32 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 + +# LINT.IfChange(clear_calibrator) +def clear_calibrator() -> None: ... + +# LINT.ThenChange() + +# LINT.IfChange(clear_data_from_calibrator) +def clear_data_from_calibrator(id: bytes) -> None: ... + +# LINT.ThenChange() + +# LINT.IfChange(get_statistics_from_calibrator) +def get_statistics_from_calibrator( + id: bytes, +) -> calibration_statistics_pb2.CalibrationStatistics: ... + +# LINT.ThenChange() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD index 635f71ec59fb6e..574eb7be350d4e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD @@ -12,8 +12,7 @@ package( # By default, these targets should only be used within the quantization library. 
default_visibility = [ "//learning/brain/mlir/quantization:__subpackages__", - "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", - "//tensorflow/compiler/mlir/quantization/tensorflow:__subpackages__", + "//tensorflow/compiler/mlir/quantization:__subpackages__", ], licenses = ["notice"], ) @@ -126,27 +125,6 @@ tf_cc_test( ], ) -cc_library( - name = "status_macro", - hdrs = ["status_macro.h"], - compatible_with = get_compatible_with_portable(), - deps = [ - "@com_google_absl//absl/status", - "@local_tsl//tsl/platform:macros", - ], -) - -tf_cc_test( - name = "status_macro_test", - srcs = ["status_macro_test.cc"], - deps = [ - ":status_macro", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@com_google_absl//absl/status", - ], -) - cc_library( name = "run_passes", srcs = ["run_passes.cc"], @@ -157,6 +135,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:error_util", "@com_google_absl//absl/status", "@com_google_absl//absl/strings:str_format", + "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_tsl//tsl/platform:statusor", @@ -173,7 +152,7 @@ cc_library( ], compatible_with = get_compatible_with_portable(), deps = [ - "//tensorflow/compiler/mlir/quantization/tensorflow/utils:lift_as_function_call_utils", + "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:constant_fold_utils", "@com_google_absl//absl/container:flat_hash_set", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc index 64d89dad2e27f0..565adebfe52300 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc @@ -19,7 +19,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro.h b/tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro.h deleted file mode 100644 index 5dc784dc8a67c8..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_STATUS_MACRO_H_ -#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_STATUS_MACRO_H_ - -#include "tsl/platform/macros.h" - -namespace tensorflow { -namespace quantization { - -// Similar to TF_RETURN_IF_ERROR but used for `absl::Status`. -#define TF_QUANT_RETURN_IF_ERROR(expr) \ - do { \ - ::absl::Status _status = (expr); \ - if (TF_PREDICT_FALSE(!_status.ok())) return _status; \ - } while (0) - -} // namespace quantization -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CC_STATUS_MACRO_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro_test.cc deleted file mode 100644 index 1e9de6b43d74ed..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro.h" - -#include "absl/status/status.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace quantization { -namespace { - -using ::testing::Eq; - -TEST(TfQuantReturnIfErrorTest, DoesNotReturnIfOk) { - const auto returned_status = []() -> absl::Status { - TF_QUANT_RETURN_IF_ERROR(absl::OkStatus()); - return absl::InternalError("Expected"); - }(); - - EXPECT_THAT(returned_status.message(), Eq("Expected")); -} - -TEST(TfQuantReturnIfErrorTest, ReturnsIfOk) { - const auto returned_status = []() -> absl::Status { - TF_QUANT_RETURN_IF_ERROR(absl::InternalError("Expected")); - return absl::OkStatus(); - }(); - - EXPECT_THAT(returned_status.message(), Eq("Expected")); -} - -} // namespace -} // namespace quantization -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD index d556e09ee9bba2..fa201ff6a716bc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD @@ -1,5 +1,5 @@ -load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -46,7 +46,6 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow/utils:tf_quantize_op_utils", 
"//tensorflow/compiler/mlir/tensorflow", @@ -65,7 +64,7 @@ tf_cc_test( srcs = ["tf_quantize_op_test.cc"], deps = [ ":tf_quantize_op", - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op_test.cc index 971237c5175eb7..6fea7f1cc4778a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op_test.cc @@ -27,7 +27,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc index 8ab909ba432231..4a205648a777e6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc @@ -22,7 +22,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td index ace1a77e6f32ae..80c65560aa1421 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.td @@ -17,8 +17,8 @@ include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" //===----------------------------------------------------------------------===// // Pattern rules for converting bfloat16 operations to fp32 conversions. 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc index 994ebea795d079..d23a0f8d3a7af2 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc @@ -39,7 +39,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.td index c2046a3fd70d47..2e6e92ba467fda 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.td @@ -17,9 +17,8 @@ include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" -include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" -include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td" // Only handles the case where precision config is default. def IsPrecisionEmpty : diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td index 945f992188642f..9d39d89c42ae53 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tpu_model_to_cpu.td @@ -15,7 +15,7 @@ limitations under the License. include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" // Combines the two variadic arguments ($in_tensors and $captured_tensors). def GetBatchFunctionOpArgOperands: diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc index f4994cd9c4eaea..68014ebec46605 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc @@ -32,10 +32,10 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -196,37 +196,54 @@ class AddCustomAggregationOp : public RewritePattern { // Return early if the given operator is the custom aggregator op. if (dyn_cast_or_null(op)) return failure(); - // Return early if the given op is a non-quantizable op. - auto call_op = dyn_cast_or_null(op); - if (call_op && !op->hasAttr(kQuantTraitAttrName)) { - return failure(); - } - - bool mutated = false; - for (Value input : op->getOperands()) { - Type element_type = getElementTypeOrSelf(input.getType()); - // Non-float cases won't be calibrated. - if (!element_type.isF32()) { - continue; - } - - // Skip when the given operator is under the quantizable spot. - if (IsInLiftedFunc(op)) { - continue; - } - - // Skip when there is any already existing CustomAggregatorOp found. - Operation *defining_op = input.getDefiningOp(); - if (dyn_cast_or_null(defining_op)) { - continue; + // The CustomAggregatorOp is only added after quantizable values. + SmallVector quantizable_values; + if (isCallToLiftedFunction(op)) { + // Quantize inputs of quantizable composite functions. + for (Value input : op->getOperands()) { + Type element_type = getElementTypeOrSelf(input.getType()); + // Non-float cases won't be calibrated. + if (!element_type.isF32()) { + continue; + } + + // Skip when there is any already existing CustomAggregatorOp found. + Operation *defining_op = input.getDefiningOp(); + if (dyn_cast_or_null(defining_op)) { + continue; + } + + // Skip calibration when the given operand comes from a constant. + if (defining_op != nullptr && + defining_op->hasTrait()) { + continue; + } + + quantizable_values.push_back(input); } - - // Skip calibration when the given operand comes from a constant. - if (defining_op != nullptr && - defining_op->hasTrait()) { - continue; + } else { + // Quantize output of fully quantizable composite functions. + for (Value input : op->getOperands()) { + auto defining_op = input.getDefiningOp(); + if (!isCallToLiftedFunction(defining_op)) { + continue; + } + + // Do not add CustomAggregatorOp after Gather since it is a weight-only + // quantizable op. + if (auto call_op = + dyn_cast_or_null(defining_op)) { + StringRef function_name = + call_op.getFAttr().cast().getValue(); + if (function_name.contains("gather")) continue; + } + + quantizable_values.push_back(input); } + } + if (quantizable_values.empty()) return failure(); + for (Value value : quantizable_values) { // ID attribute will have empty value for now. SmallVector attributes{ rewriter.getNamedAttr("id", rewriter.getStringAttr("")), @@ -248,24 +265,32 @@ class AddCustomAggregationOp : public RewritePattern { }; // Insert custom aggregation op between operand and operator. 
- rewriter.setInsertionPointAfterValue(input); + rewriter.setInsertionPointAfterValue(value); Operation *aggregator_op = rewriter.create( - op->getLoc(), input.getType(), input, attributes); + op->getLoc(), value.getType(), value, attributes); Value aggregator_op_result = aggregator_op->getOpResult(0); - input.replaceAllUsesWith(aggregator_op_result); - aggregator_op->replaceUsesOfWith(aggregator_op_result, input); - - // Mark mutated. - mutated = true; + value.replaceAllUsesWith(aggregator_op_result); + aggregator_op->replaceUsesOfWith(aggregator_op_result, value); } - // Return failure when there is no matching operand. - return mutated ? success() : failure(); + return success(); } private: CalibrationOptions calib_opts_; + + // Whether the op is a call op to lifted composite function. + bool isCallToLiftedFunction(Operation *op) const { + if (!op) return false; + if (isa(op)) return true; + + TF::PartitionedCallOp call_op = dyn_cast_or_null(op); + return call_op && call_op->hasAttrOfType(kQuantTraitAttrName) && + call_op->getAttrOfType(kQuantTraitAttrName) + .getValue() + .equals(QuantTraitValues[QuantizationTrait::FullyQuantizable]); + } }; void InsertCustomAggregationOpsPass::runOnOperation() { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc index 0e6ce592ea0b8e..b471b7910d0eef 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc @@ -32,6 +32,7 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc index 38eabad77a9052..1f94cdfff15754 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include @@ -38,12 +39,12 @@ limitations under the License. 
#include "re2/re2.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -51,14 +52,12 @@ namespace mlir { namespace quant { namespace { -using QuantizationOptions = tensorflow::quantization::QuantizationOptions; -using QuantizationMethod = tensorflow::quantization::QuantizationMethod; -using QuantizationComponentSpec = - tensorflow::quantization::QuantizationComponentSpec; -using UnitWiseQuantizationSpec = - tensorflow::quantization::UnitWiseQuantizationSpec; using QuantizationUnit = - tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit; + ::tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit; +using ::tensorflow::quantization::QuantizationComponentSpec; +using ::tensorflow::quantization::QuantizationMethod; +using ::tensorflow::quantization::QuantizationOptions; +using ::tensorflow::quantization::UnitWiseQuantizationSpec; class LiftQuantizableSpotsAsFunctionsPass : public PassWrapper().getNumElements(); + if (num_elements < quant_options_.min_num_elements_for_weights()) { + return absl::InternalError( + "The params of Gather have fewer number of elements than " + "the `min_num_elements_for_weights`."); + } } // Disable quantization if the quantization method is NO_QUANTIZE. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td index 1628cedf99e9cd..d56ee05dc071dc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.td @@ -17,8 +17,8 @@ include "mlir/IR/OpBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" +include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" //===----------------------------------------------------------------------===// // Helper functions. 
@@ -62,7 +62,7 @@ def LiftDepthwiseConv : Pat< [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; def LiftMatMul : Pat< - (TF_MatMulOp:$res $a, $b, $transpose_a, $transpose_b), + (TF_MatMulOp:$res $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), (LiftAsTFPartitionedCall<"composite_matmul_fn"> (ArgumentList $a, $b), (ResultList $res), @@ -84,7 +84,7 @@ def LiftConv3D : Pat< [(IsNotInLiftedFunc $res)], [], (addBenefit 1)>; def LiftBatchMatMul : Pat< - (TF_BatchMatMulV2Op:$res $x, $y, $adj_x, $adj_y), + (TF_BatchMatMulV2Op:$res $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), (LiftAsTFPartitionedCall<"composite_batch_matmul_fn"> (ArgumentList $x, $y), (ResultList $res), @@ -142,7 +142,7 @@ def LiftConv2dWithBias : Pat< def LiftMatmulWithBias : Pat< (TF_BiasAddOp:$res - (TF_MatMulOp $a, $b, $transpose_a, $transpose_b), + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), $bias, IsDataFormatNHWC:$bias_data_format), (LiftAsTFPartitionedCall<"composite_matmul_with_bias_fn"> (ArgumentList $a, $b, $bias), @@ -157,7 +157,7 @@ def LiftMatmulWithBias : Pat< def LiftMatmulWithReshapeAndBias : Pat< (TF_BiasAddOp:$res (TF_ReshapeOp:$out - (TF_MatMulOp $a, $b, $transpose_a, $transpose_b), + (TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), $shape), $bias, IsDataFormatNHWC:$bias_data_format), (LiftAsTFPartitionedCall<"composite_matmul_with_reshape_and_bias_fn"> @@ -184,7 +184,7 @@ def LiftConv3dWithBias : Pat< def LiftBatchMatMulWithBias : Pat< (TF_BiasAddOp:$res - (TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y), + (TF_BatchMatMulV2Op $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), $bias, IsDataFormatNHWC:$bias_data_format), (LiftAsTFPartitionedCall<"composite_batch_matmul_with_bias_fn"> (ArgumentList $x, $y, $bias), @@ -276,7 +276,7 @@ multiclass LiftCompositeOpsWithActivation (ArgumentList $a, $b), (ResultList $res), @@ -288,7 +288,7 @@ multiclass LiftCompositeOpsWithActivation (ArgumentList $a, $b, $bias), @@ -328,7 +328,7 @@ multiclass LiftCompositeOpsWithActivation (ArgumentList $x, $y), (ResultList $res), @@ -340,7 +340,7 @@ multiclass LiftCompositeOpsWithActivation (ArgumentList $x, $y, $bias), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc index f86dcf3b3287ed..3e631835cd0ee5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc @@ -26,10 +26,10 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/utils/lift_as_function_call_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -37,7 +37,8 @@ namespace mlir { namespace quant { namespace { -using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; +using QuantMethod = + ::tensorflow::quantization::QuantizationMethod::PresetMethod; class LiftQuantizableSpotsAsFunctionsDRQPass : public PassWrapper; def LiftMatMul : Pat< - (TF_MatMulOp:$res $a, $b, $transpose_a, $transpose_b), + (TF_MatMulOp:$res $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), (LiftAsTFPartitionedCall<"composite_matmul_fn"> (ArgumentList $a, $b), (ResultList $res), @@ -83,7 +83,7 @@ def LiftConv3D : Pat< [(IsNotInLiftedFunc $res), (IsConstTensor $filter)], [], (addBenefit 1)>; def LiftBatchMatMul : Pat< - (TF_BatchMatMulV2Op:$res $x, $y, $adj_x, $adj_y), + (TF_BatchMatMulV2Op:$res $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), (LiftAsTFPartitionedCall<"composite_batch_matmul_fn"> (ArgumentList $x, $y), (ResultList $res), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc index 1ba9d68347e2ce..b459bbcd901125 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc @@ -21,7 +21,6 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir::quant { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.td index 2348ac80b845f1..c40902d283e8cc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.td @@ -17,8 +17,8 @@ include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" // Remove redundant `CastOp` to int8 if the input is properly clipped. 
def RemoveRedundantCastOps : Pat< diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h index 7deed9306fcf88..7300cb3996b131 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h @@ -25,7 +25,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.td index 5d879adea90a50..7e00f588f9dc71 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/post_quantize.td @@ -18,9 +18,9 @@ include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" include "mlir/Dialect/Arith/IR/ArithOps.td" include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" // Re-orders the Identity op following a quantized composite function. This // allows the QuantizeCompositeFunctionsPass to merge the DequantizeCast with diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc index e0fb1224d5540a..886a27011b1825 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc @@ -37,9 +37,9 @@ limitations under the License. #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/remove_identity_op_pattern.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" @@ -182,7 +182,7 @@ Value MakeOneDimValueBroadcastable(OpBuilder& builder, Location loc, return ConstantFoldOpIfPossible(reshape_op).front(); } -// Checks if a value can be symetrically quantized. +// Checks if a value can be symmetrically quantized. bool CanBeSymmetricallyQuantized(Value weight) { auto dq_op = weight.getDefiningOp(); if (!dq_op) return true; @@ -215,7 +215,7 @@ SmallVector MultiplyTwoArrays(ArrayRef a, ArrayRef b) { } // Multiplies the value followed by a FakeQuant op and adjusts the quantization -// params. 
This funtion only supports symetrically quantized values. +// params. This function only supports symmetrically quantized values. Value MultiplyFakeQuantValue(OpBuilder& builder, Location loc, Value value, Value multiplier) { auto dq_op = value.getDefiningOp(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td index f88644a378dd9a..30e298dd6e7048 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td @@ -17,8 +17,8 @@ include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" include "mlir/Dialect/Arith/IR/ArithOps.td" // Converts arith.constant ops from freezing passes back to tf.Const ops. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc index 209c173bcae701..b5fb96396f7ef9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc @@ -41,7 +41,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.td index 328736da06c40d..4fa7ef333f67ee 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.td @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" // Converts tf.Const to arith.constant for statically shaped, non-opaque constants. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc index 2d96d13091c62c..8f550a8e5633e4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc @@ -31,9 +31,8 @@ limitations under the License. 
#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" -#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" //===----------------------------------------------------------------------===// @@ -46,6 +45,7 @@ namespace { using QuantizationUnit = std::pair; using QuantizationUnits = llvm::SetVector; +using ::mlir::quant::OpSet; // Applies prepare quantization on the model in TF dialect for dynamic range // quantization case. @@ -127,7 +127,7 @@ class PrepareDRQQuantizableOp : public OpRewritePattern { return failure(); } - // 2. Quantize collected ops. It is immediatly quantized by inserting Q-DQ + // 2. Quantize collected ops. It is immediately quantized by inserting Q-DQ // pair for int8. if (!(quantizeOps(rewriter, op, quantizable_ops))) { return failure(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc index 3c23c4edf0bb11..3f6960dd861fb6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc @@ -31,8 +31,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" //===----------------------------------------------------------------------===// @@ -43,7 +43,8 @@ namespace quant { namespace { -using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod; +using QuantMethod = + ::tensorflow::quantization::QuantizationMethod::PresetMethod; using QuantizationUnit = std::pair; using QuantizationUnits = llvm::SetVector; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.td index 328736da06c40d..4fa7ef333f67ee 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.td @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" // Converts tf.Const to arith.constant for statically shaped, non-opaque constants. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc index 4e69d48eed69c1..8570652b4019e7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc @@ -32,8 +32,8 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc index bf93774e67f73d..56c43988e42e4a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc @@ -46,8 +46,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/framework/types.pb.h" @@ -315,7 +315,7 @@ class QuantizeSameScaleOpsPattern } private: - // Checks whether the operation is connnected with a composite function. + // Checks whether the operation is connected with a composite function. // If not, the same-scale op will not be quantized. This decision is based // on the current assumption that the performance gain of the same-scale // op itself could not beat the overhead of the quantize and dequantize diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc index 1945b69f36f71c..e4eecf204e85c9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc @@ -50,7 +50,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.td index 1d2bee74d9b4a4..23722a510ac987 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.td @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" // Converts reamaining arith.constant ops from quantization passes back to diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_weights.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_weights.cc index e666fae001024b..2cd7949be7f60c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_weights.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_weights.cc @@ -44,8 +44,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc index a206a719c26599..374d687428ee3e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc @@ -36,8 +36,8 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "xla/xla_data.pb.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.td index b6810be4d846d9..ccd477c310e27c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.td @@ -16,8 +16,8 @@ include "mlir/IR/OpBase.td" include "mlir/IR/PatternBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/Dialect/Arith/IR/ArithOps.td" +include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" -include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.td" def CreateXLAConvOpFromTFConv2DOp : NativeCodeCall< "CreateXlaConvOpFromTfConv2dOp($_builder, $_loc, $0...)">; @@ -216,7 +216,7 @@ def ConvertTFMatMulToXLADotV2Op : Pat< (TF_MatMulOp:$matmul (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), (TF_CastOp (TF_IdentityOp $weight), $truncate1), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, $input_zp, /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), $matmul, @@ -235,7 +235,7 @@ def ConvertTFMatMulToXLADotV2OpDynamicRange : Pat< (TF_MatMulOp:$matmul (TF_SubOp:$input (TF_CastOp $input_i8, $truncate0), $input_zp), (TF_CastOp (TF_IdentityOp $weight), $truncate1), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -252,7 +252,7 @@ def ConvertTFMatMulToXLADotV2OpWeightOnly : Pat< (TF_MatMulOp:$matmul $input, (TF_MulOp (TF_CastOp (TF_IdentityOp $weight), $truncate1), $scale), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (TF_MulOp (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -270,7 +270,7 @@ def ConvertTFMatMulWithNoZeroPointToXLADotV2Op : Pat< (TF_MatMulOp:$matmul (TF_CastOp $input, $truncate), (TF_CastOp (TF_IdentityOp $weight), $truncate1), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -288,7 +288,7 @@ def ConvertTFMatMulWithTwoInputTensorsToXLADotV2Op : Pat< (TF_MatMulOp:$matmul (TF_SubOp (TF_CastOp $input, $truncate1), $input_zp), (TF_SubOp (TF_CastOp $weight, $truncate2), $weight_zp), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), 
(CreateXlaDotV2OpFromTfMatMulOp $input, $weight, $input_zp, $weight_zp, $matmul, $transpose_a, $transpose_b), [(IsInt8ElementType $input), @@ -306,7 +306,7 @@ def ConvertTFMatMulWithTwoInputTensorsAndNoInputZeroPointToXLADotV2Op : Pat< (TF_MatMulOp:$matmul (TF_CastOp $input, $truncate), (TF_SubOp (TF_CastOp $weight, $truncate2), $weight_zp), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), $weight_zp, $matmul, $transpose_a, $transpose_b), @@ -324,7 +324,7 @@ def ConvertTFMatMulWithTwoInputTensorsAndNoWeightZeroPointToXLADotV2Op : Pat< (TF_MatMulOp:$matmul (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), (TF_CastOp $weight, $truncate1), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, $input_zp, /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -343,7 +343,7 @@ def ConvertTFMatMulWithTwoInputTensorsAndNoBothZeroPointsToXLADotV2Op : Pat< (TF_MatMulOp:$matmul (TF_CastOp $input, $truncate), (TF_CastOp $weight, $truncate1), - $transpose_a, $transpose_b), + $transpose_a, $transpose_b, $grad_a, $grad_b), (CreateXlaDotV2OpFromTfMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -407,7 +407,7 @@ def ConvertTFBatchMatMulToXLADotV2Op : Pat< (TF_BatchMatMulV2Op:$batch_matmul (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), (TF_CastOp (TF_IdentityOp $weight), $truncate1), - $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), (CreateXlaDotV2OpFromTfBatchMatMulOp $input, $weight, $input_zp, /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -426,7 +426,7 @@ def ConvertTFBatchMatMulWithNoZeroPointToXLADotV2Op : Pat< (TF_BatchMatMulV2Op:$batch_matmul (TF_CastOp $input, $truncate), (TF_CastOp (TF_IdentityOp $weight), $truncate1), - $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), (CreateXlaDotV2OpFromTfBatchMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -447,7 +447,7 @@ def ConvertTFBatchMatMulWithTwoInputTensorsToXLADotV2Op : Pat< (TF_BatchMatMulV2Op:$batch_matmul (TF_SubOp (TF_CastOp $input, $truncate), $input_zp), (TF_SubOp (TF_CastOp (TF_IdentityOp $weight), $truncate1), $weight_zp), - $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), (CreateXlaDotV2OpFromTfBatchMatMulOp $input, $weight, $input_zp, $weight_zp, $batch_matmul, $adj_x, $adj_y), [(IsInt8ElementType $input), @@ -465,7 +465,7 @@ def ConvertTFBatchMatMulWithTwoInputTensorsAndNoInputZeroPointToXLADotV2Op : Pat (TF_BatchMatMulV2Op:$batch_matmul (TF_CastOp $input, $truncate), (TF_SubOp (TF_CastOp (TF_IdentityOp $weight), $truncate1), $weight_zp), - $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), (CreateXlaDotV2OpFromTfBatchMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), $weight_zp, $batch_matmul, $adj_x, $adj_y), @@ -483,7 +483,7 @@ def ConvertTFBatchMatMulWithTwoInputTensorsAndNoWeightZeroPointToXLADotV2Op : Pa (TF_BatchMatMulV2Op:$batch_matmul (TF_SubOp (TF_CastOp $input, $truncate1), $input_zp), (TF_CastOp $weight, $truncate2), - $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), (CreateXlaDotV2OpFromTfBatchMatMulOp $input, $weight, $input_zp, /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), @@ -502,7 +502,7 @@ def 
ConvertTFBatchMatMulWithTwoInputTensorsAndNoBothZeroPointsToXLADotV2Op : Pat (TF_BatchMatMulV2Op:$batch_matmul (TF_CastOp $input, $truncate1), (TF_CastOp $weight, $truncate2), - $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), (CreateXlaDotV2OpFromTfBatchMatMulOp $input, $weight, /*input_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), /*weight_zp=*/(CreateScalarIntegerConst<"int32_t", "0">), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc index 5020550ca65a7c..87d230fb16bbde 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_opt.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" @@ -40,7 +41,8 @@ int main(int argc, char **argv) { mlir::arith::ArithDialect, mlir::tf_type::TFTypeDialect, mlir::quant::QuantizationDialect, mlir::quantfork::QuantizationForkDialect, - mlir::tf_executor::TensorFlowExecutorDialect>(); + mlir::tf_executor::TensorFlowExecutorDialect, + mlir::stablehlo::StablehloDialect>(); mlir::func::registerAllExtensions(registry); return failed( mlir::MlirOptMain(argc, argv, "TF quant Pass Driver\n", registry)); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index 31a016b374675c..808e0b36af2d24 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -13,6 +13,7 @@ load( package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", "//tensorflow/compiler/mlir/quantization/tensorflow:internal_visibility_allowlist_package", "//tensorflow/python:__subpackages__", ], @@ -33,7 +34,12 @@ cc_library( "//tensorflow/python:__pkg__", ], deps = [ + ":unfreeze_constants", "//tensorflow/cc/saved_model:loader", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:export", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:precalibration", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", @@ -42,15 +48,12 @@ cc_library( "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:custom_aggregator_op", # Required for CustomAggregator op registration. 
"//tensorflow/compiler/mlir/quantization/tensorflow/cc:convert_asset_args", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", - "//tensorflow/compiler/mlir/quantization/tensorflow/cc:save_variables", - "//tensorflow/compiler/mlir/quantization/tensorflow/cc:status_macro", "//tensorflow/compiler/mlir/quantization/tensorflow/debugging:dump_tensor_op", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:export_graphdef", "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", @@ -59,6 +62,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", @@ -71,11 +75,10 @@ cc_library( "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:ShapeDialect", - "@local_tsl//tsl/platform:env", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:path", - "@local_tsl//tsl/platform:status", "@local_tsl//tsl/platform:statusor", - "@stablehlo//:stablehlo_ops", ], ) @@ -100,11 +103,28 @@ cc_library( pytype_strict_library( name = "py_function_lib_py", srcs = ["py_function_lib.py"], - visibility = ["//visibility:private"], deps = [ ":pywrap_function_lib", + ":representative_dataset", + ":save_model", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_py", - "@pypi_typing_extensions//:pkg", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_algorithm", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_py", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:pywrap_calibration", + "//tensorflow/core:protos_all_py", + "//tensorflow/python/client:session", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:wrap_function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_conversion", + "//tensorflow/python/lib/io:file_io", + "//tensorflow/python/saved_model:load", + "//tensorflow/python/saved_model:loader", + "//tensorflow/python/trackable:autotrackable", + "//tensorflow/python/types:core", + "//third_party/py/numpy", + "@absl_py//absl/logging", ], ) @@ -130,13 +150,14 @@ cc_library( "-use_header_modules", "-parse_headers", ], - visibility = ["//visibility:private"], deps = [ "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/core/protobuf:for_core_protos_cc", "//tensorflow/python/lib/core:pybind11_lib", "//third_party/python_runtime:headers", # build_cleaner: keep; Required for pybind11. 
"@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:protobuf", "@pybind11", "@pybind11_abseil//pybind11_abseil:absl_casters", ], @@ -145,9 +166,38 @@ cc_library( cc_library( name = "py_function_lib", hdrs = ["py_function_lib.h"], - visibility = ["//visibility:private"], + compatible_with = get_compatible_with_portable(), deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:min_max_value", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings:string_view", + "@pybind11", + ], +) + +cc_library( + name = "unfreeze_constants", + srcs = ["unfreeze_constants.cc"], + hdrs = ["unfreeze_constants.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/tensorflow:passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:save_variables", + "//tensorflow/core:lib", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:statusor", ], ) @@ -159,8 +209,14 @@ tf_python_pybind_extension( deps = [ ":py_function_lib", ":type_casters", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:min_max_value", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", - "//tensorflow/python/lib/core:pybind11_lib", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings:string_view", "@pybind11", ], ) @@ -175,15 +231,19 @@ tf_python_pybind_extension( deps = [ ":py_function_lib", ":type_casters", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:assign_ids", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:statistics", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton", - "//tensorflow/python/lib/core:pybind11_lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf:for_core_protos_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", "@pybind11", "@pybind11_abseil//pybind11_abseil:absl_casters", "@pybind11_abseil//pybind11_abseil:import_status_module", @@ -198,6 
+258,7 @@ tf_py_strict_test( "pywrap_quantize_model_test.py", ], deps = [ + ":py_function_lib_py", ":pywrap_quantize_model", "//tensorflow:tensorflow_py", "//tensorflow/python/platform:client_testlib", @@ -242,6 +303,7 @@ pytype_strict_library( "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_algorithm", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_py", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:pywrap_calibration", "//tensorflow/core:protos_all_py", "//tensorflow/python/client:session", "//tensorflow/python/eager:context", @@ -256,7 +318,6 @@ pytype_strict_library( "//tensorflow/python/trackable:autotrackable", "//tensorflow/python/types:core", "//tensorflow/python/util:tf_export", - "//third_party/py/numpy", "@absl_py//absl/logging", ], ) @@ -378,6 +439,7 @@ pytype_strict_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", + "//tensorflow/core:protos_all_py", "//tensorflow/python/client:session", "//tensorflow/python/data/ops:readers", "//tensorflow/python/eager:context", @@ -386,6 +448,7 @@ pytype_strict_library( "//tensorflow/python/platform:tf_logging", "//tensorflow/python/types:core", "//tensorflow/python/util:tf_export", + "//third_party/py/numpy", ], ) @@ -394,7 +457,9 @@ tf_py_strict_test( srcs = ["representative_dataset_test.py"], deps = [ ":representative_dataset", + "//tensorflow/core:protos_all_py", "//tensorflow/python/client:session", + "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index 4a1fd7148b0c15..02a3b703b9db18 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -2518,68 +2518,6 @@ def data_gen() -> repr_dataset.RepresentativeDataset: else: self.assertAllClose(new_outputs, expected_outputs, atol=0.13) - @test_util.run_in_graph_and_eager_modes - def test_matmul_ptq_model_stablehlo(self): - activation_fn = None - has_bias = False - batch_sizes = ([], []) - target_opset = quant_opts_pb2.STABLEHLO - - lhs_batch_size, rhs_batch_size = batch_sizes - input_shape = (*lhs_batch_size, 1, 1024) - filter_shape = (*rhs_batch_size, 1024, 3) - static_input_shape = [dim if dim is not None else 2 for dim in input_shape] - model = self._create_matmul_model( - input_shape, - filter_shape, - self._input_saved_model_path, - has_bias, - activation_fn, - ) - rng = np.random.default_rng(seed=1234) - - input_data = ops.convert_to_tensor( - rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( - np.float32 - ) - ) - expected_outputs = model.matmul(input_data) - - def data_gen() -> repr_dataset.RepresentativeDataset: - for _ in range(100): - yield { - 'input_tensor': rng.uniform( - low=0.0, high=1.0, size=static_input_shape - ).astype(np.float32) - } - - quantization_options = quant_opts_pb2.QuantizationOptions( - quantization_method=quant_opts_pb2.QuantizationMethod( - preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8 - ), - 
tags={tag_constants.SERVING}, - signature_keys=['serving_default'], - op_set=target_opset, - ) - converted_model = quantize_model.quantize( - self._input_saved_model_path, - self._output_saved_model_path, - quantization_options, - representative_dataset=data_gen(), - ) - - self.assertIsNotNone(converted_model) - self.assertCountEqual( - converted_model.signatures._signatures.keys(), {'serving_default'} - ) - - new_outputs = converted_model.signatures['serving_default']( - input_tensor=ops.convert_to_tensor(input_data) - ) - # Tests that the quantized graph outputs similar values. The rtol value is - # arbitrary. - self.assertAllClose(new_outputs, expected_outputs, rtol=0.02) - @parameterized.named_parameters( { 'testcase_name': 'with_biasadd', @@ -2972,13 +2910,17 @@ def test_matmul_ptq_model_with_unfreeze_constants(self): ) @parameterized.named_parameters( - ('use_constant_with_int32_input', dtypes.int32, False), - ('use_variable_with_int32_input', dtypes.int32, True), - ('use_constant_with_int64_input', dtypes.int64, False), - ('use_variable_with_int64_input', dtypes.int64, True), + ('use_constant_with_int32_input', dtypes.int32, False, True), + ('use_variable_with_int32_input', dtypes.int32, True, True), + ('use_constant_with_int64_input', dtypes.int64, False, True), + ('use_variable_with_int64_input', dtypes.int64, True, True), + ('small_gather_use_constant', dtypes.int32, False, False), + ('small_gather_use_variable', dtypes.int32, True, False), ) @test_util.run_v2_only - def test_gather_model(self, input_type, use_variable): + def test_gather_model( + self, input_type, use_variable, expect_quantized_gather + ): model = self._create_gather_model(input_type, use_variable) saved_model_save.save(model, self._input_saved_model_path) @@ -2991,7 +2933,9 @@ def test_gather_model(self, input_type, use_variable): ), tags=tags, signature_keys=['serving_default'], - op_set=quant_opts_pb2.TF, + op_set=quant_opts_pb2.XLA, + # Gather op is opt-outed if the size is smaller than the threshold. + min_num_elements_for_weights=1024 if expect_quantized_gather else 8192, ) data_gen = self._create_data_generator( @@ -3014,11 +2958,14 @@ def test_gather_model(self, input_type, use_variable): converted_model.signatures._signatures.keys(), {'serving_default'} ) - output_loader = saved_model_loader.SavedModelLoader( - self._output_saved_model_path - ) - output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def - self.assertTrue(self._contains_quantized_function_call(output_graphdef)) + if expect_quantized_gather: + self.assertSizeRatioLessThan( + self._output_saved_model_path, self._input_saved_model_path, 1 / 3 + ) + else: + self.assertSizeRatioGreaterThan( + self._output_saved_model_path, self._input_saved_model_path, 2 / 3 + ) @test_util.run_in_graph_and_eager_modes def test_model_ptq_use_representative_samples_list(self): @@ -3366,7 +3313,7 @@ def test_model_ptq_use_tf_dataset_for_representative_dataset(self): self.assertTrue(self._contains_quantized_function_call(output_graphdef)) @test_util.run_in_graph_and_eager_modes - def test_model_ptq_no_representative_sample_shows_warnings(self): + def test_model_ptq_no_representative_sample_not_quantized(self): self._create_matmul_model( input_shape=(1, 1024), weight_shape=(1024, 3), @@ -3382,30 +3329,14 @@ def test_model_ptq_no_representative_sample_shows_warnings(self): signature_keys=['serving_default'], ) - with self.assertLogs(level='WARN') as warning_logs: - # Save the logger verbosity. 
- prev_log_level = logging.get_verbosity() - logging.set_verbosity(logging.WARN) - - try: - converted_model = quantize_model.quantize( - self._input_saved_model_path, - self._output_saved_model_path, - quantization_options, - # Put no sample into the representative dataset to make calibration - # impossible. - representative_dataset=[], - ) - finally: - # Restore the logger verbosity. - logging.set_verbosity(prev_log_level) - - self.assertNotEmpty(warning_logs.records) - self.assertTrue( - self._any_log_contains( - 'does not have min or max values', warning_logs.records - ) - ) + converted_model = quantize_model.quantize( + self._input_saved_model_path, + self._output_saved_model_path, + quantization_options, + # Put no sample into the representative dataset to make calibration + # impossible. + representative_dataset=[], + ) self.assertIsNotNone(converted_model) self.assertCountEqual( @@ -3486,36 +3417,12 @@ def data_gen() -> repr_dataset.RepresentativeDataset: op_set=quant_opts_pb2.TF, ) - with self.assertLogs(level='WARN') as warning_logs: - # Save the logger verbosity. - log_level = logging.get_verbosity() - logging.set_verbosity(logging.WARN) - - try: - converted_model = quantize_model.quantize( - self._input_saved_model_path, - self._output_saved_model_path, - quantization_options, - representative_dataset=data_gen(), - ) - finally: - # Restore the logger verbosity. - logging.set_verbosity(log_level) - - self.assertNotEmpty(warning_logs.records) - - # Warning message should contain the function name. The uncalibrated path - # is when the condition is true, so 'cond_true' function must be part of - # the warning message. - self.assertTrue(self._any_log_contains('cond_true', warning_logs.records)) - self.assertFalse( - self._any_log_contains('cond_false', warning_logs.records) - ) - self.assertTrue( - self._any_log_contains( - 'does not have min or max values', warning_logs.records - ) - ) + converted_model = quantize_model.quantize( + self._input_saved_model_path, + self._output_saved_model_path, + quantization_options, + representative_dataset=data_gen(), + ) self.assertIsNotNone(converted_model) self.assertCountEqual( @@ -3527,6 +3434,25 @@ def data_gen() -> repr_dataset.RepresentativeDataset: output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def self.assertTrue(self._contains_quantized_function_call(output_graphdef)) + # Tests that the false branch contains a quantized function call whereas the + # true branch doesn't. + def _is_quantized_function_call_node( + node_def: node_def_pb2.NodeDef, + ) -> bool: + return node_def.op == 'PartitionedCall' and node_def.attr[ + 'f' + ].func.name.startswith('quantized_') + + for func in output_graphdef.library.function: + if func.signature.name.startswith('cond_false'): + self.assertTrue( + any(map(_is_quantized_function_call_node, func.node_def)) + ) + elif func.signature.name.startswith('cond_true'): + self.assertFalse( + any(map(_is_quantized_function_call_node, func.node_def)) + ) + # Run this test only with the eager mode. @test_util.run_v2_only def test_ptq_model_with_multiple_signatures(self): diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h index 4d120f29491293..dbb557f2b5b033 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h @@ -15,7 +15,18 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_PY_FUNCTION_LIB_H_ +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "pybind11/pytypes.h" // from @pybind11 +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" namespace tensorflow::quantization { @@ -27,12 +38,71 @@ class PyFunctionLibrary { public: virtual ~PyFunctionLibrary() = default; - // Assigns UUIDs to each CustomAggregator op found in each GraphDef in - // `exported_model`. The UUIDs are set to the `id` attributes. The UUIDs will - // be used during calibration step to identify the collected quantization - // statistics for each CustsomAggregator op. - virtual ExportedModel AssignIdsToCustomAggregatorOps( - const ExportedModel& exported_model) const = 0; + // Saves `exported_model` to `dst_saved_model_path` as SavedModel. + // `src_saved_model_path` is the path to the source SavedModel from which the + // exported model is produced. It is used to copy the asset files to + // `dst_saved_model_path`. `tags` will be attached to the saved + // `MetaGraphDef`. `signature_def_map` will be passed to the + // `add_meta_graph_and_variables` function, which is internally used to add a + // `MetaGraphDef` to save to the SavedModel. + // + // If the function signature changes, likely its corresponding .pyi type + // hinting and definition should also change. + // LINT.IfChange + virtual void SaveExportedModel( + absl::string_view dst_saved_model_path, + const ExportedModel& exported_model, + absl::string_view src_saved_model_path, + const std::unordered_set& tags, + const absl::flat_hash_map& + signature_def_map) const = 0; + // LINT.ThenChange( + // pywrap_function_lib.pyi:save_exported_model, + // py_function_lib.py:save_exported_model, + // ) + + // Runs calibration on a model saved at `saved_model_path`. `exported_model` + // should be the corresponding exported model resulting from the + // pre-calibration step. `signature_keys` is a set of keys that identify a + // SignatureDef to run the calibration on. `tags` is a set of strings that + // identify the `MetaGraphDef`. `calibration_options` provides configurations + // for the calibration behavior. `representative_dataset` is a python object + // of type `RepresentativeDatasetOrMapping`, which is used to run the + // calibration. + // + // Returns the updated exported model where the collected calibration + // statistics are added to `CustomAggregator` nodes at the `min` and `max` + // attributes. + // + // If the function signature changes, likely its corresponding .pyi type + // hinting and definition should also change. 
+ // LINT.IfChange(run_calibration) + virtual void RunCalibration( + absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const CalibrationOptions& calibration_options, + bool force_graph_mode_calibration, + pybind11::object representative_dataset) const = 0; + // LINT.ThenChange( + // pywrap_function_lib.pyi:run_calibration, + // py_function_lib.py:run_calibration, + // ) + + // Retrieves min and max value from `calibration_statistics`, based on the + // calibration method specified by `calibration_options`. + // + // If the function signature changes, likely its corresponding .pyi type + // hinting and definition should also change. + // LINT.IfChange(get_calibration_min_max_value) + virtual stablehlo::quantization::MinMaxValue GetCalibrationMinMaxValue( + const tensorflow::calibrator::CalibrationStatistics& + calibration_statistics, + const CalibrationOptions& calibration_options) const = 0; + // LINT.ThenChange( + // pywrap_function_lib.pyi:get_calibration_min_max_value, + // py_function_lib.py:get_calibration_min_max_value, + // ) }; } // namespace tensorflow::quantization diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py index 145149e5341042..22c3be3d6034e7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py @@ -13,10 +13,516 @@ # limitations under the License. # ============================================================================== """Defines a wrapper class for overridden python method definitions.""" -import uuid +from collections.abc import Callable, Collection, Mapping, Sequence +from typing import Optional + +from absl import logging from tensorflow.compiler.mlir.quantization.tensorflow import exported_model_pb2 +from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 +from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_algorithm +from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 +from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import pywrap_calibration from tensorflow.compiler.mlir.quantization.tensorflow.python import pywrap_function_lib +from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as rd +from tensorflow.compiler.mlir.quantization.tensorflow.python import save_model +from tensorflow.core.protobuf import meta_graph_pb2 +from tensorflow.core.protobuf import saver_pb2 +from tensorflow.python.client import session +from tensorflow.python.eager import context +from tensorflow.python.eager import wrap_function +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_conversion +from tensorflow.python.lib.io import file_io +from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import loader_impl +from tensorflow.python.trackable import autotrackable +from tensorflow.python.types import core + +# Name of the saved model assets directory. +_ASSETS_DIR = 'assets' +_ASSETS_EXTRA_DIR = 'assets.extra' + + +def _get_saver_def_or_none( + exported_model: exported_model_pb2.ExportedModel, +) -> Optional[saver_pb2.SaverDef]: + """Returns the SaverDef from ExportedModel, None otherwise. + + Args: + exported_model: ExportedModel to take the SaverDef from. 
+ + Returns: + SaverDef instance if the field `saver_def` is set. None otherwise. + """ + if exported_model.HasField('saver_def'): + return exported_model.saver_def + return None + + +def _copy_assets(src_path: str, dst_path: str) -> None: + """Copies the assets directory of the saved model. + + Clones the contents of the assets/ directory from the source saved model + directory to the destination saved model directory. Nothing will be copied if + there are no assets directory in the source directory. + + Args: + src_path: Source saved model directory. + dst_path: Destination saved model directory. This directory must exist. + """ + for assets_dir_name in [_ASSETS_DIR, _ASSETS_EXTRA_DIR]: + src_assets_path = file_io.join(src_path, assets_dir_name) + if not file_io.file_exists_v2(src_assets_path): + # Do nothing if the source assets path does not exist. + continue + + dst_assets_path = file_io.join(dst_path, assets_dir_name) + file_io.create_dir_v2(dst_assets_path) + + for curr_dir, _, files in file_io.walk_v2(src_assets_path): + for asset_file_name in files: + src_asset_file = file_io.join(curr_dir, asset_file_name) + + # Construct the destination assets file path. + curr_dst_dir = curr_dir.replace(src_assets_path, dst_assets_path) + dst_asset_file = file_io.join(curr_dst_dir, asset_file_name) + + file_io.copy_v2(src_asset_file, dst_asset_file) + logging.info( + 'Copied asset file: %s -> %s', src_asset_file, dst_asset_file + ) + + +def _validate_representative_dataset( + representative_dataset: rd.RepresentativeDatasetOrMapping, + signature_keys: Collection[str], +) -> None: + """Validates the representative dataset, based on the signature keys. + + Representative dataset can be provided in two different forms: a single + instance of `RepresentativeDataset` or a map of signature key to the + corresponding `RepresentativeDataset`. These have a relationship with + `signature_keys`. + + This function validates the following conditions: + * If `len(signature_keys) > 1`, then `representative_dataset` should be a + mapping where the keys exactly match the elements in `signature_keys`. + * If `len(signature_keys) == 1`, then both a mapping and a single instance of + `RepresentativeDataset` are allowed. + * This function also assumes `len(signature_keys) > 0`. + + Args: + representative_dataset: A `RepresentativeDataset` or a map of string to + `RepresentativeDataset` to be validated. + signature_keys: A collection of strings that contains the signature keys, + each identifying a `SignatureDef`. + + Raises: + ValueError: Iff `representative_dataset` does not satisfy the conditions + above. + """ + if isinstance(representative_dataset, Mapping): + if set(signature_keys) != set(representative_dataset.keys()): + raise ValueError( + 'The signature keys and the keys of representative dataset map ' + f'do not match. Signature keys: {set(signature_keys)}, ' + f'representative dataset map: {set(representative_dataset.keys())}.' + ) + else: + if len(signature_keys) > 1: + raise ValueError( + 'Representative dataset is not a mapping ' + f'(got: {type(representative_dataset)}), ' + 'but there is more than one signature key provided. ' + 'Please provide a map of {signature_key -> dataset} ' + 'with more than one signature key.' + ) + + +def _replace_tensors_by_numpy_ndarrays( + repr_ds_map: rd.RepresentativeDatasetMapping, +) -> None: + """Replaces tf.Tensors by their evaluated numpy arrays. + + This assumes that tf.Tensors in representative samples are created in the + default Graph. 
It will raise an error if tensors are created in a different + graph. + + Args: + repr_ds_map: SignatureDef key -> RepresentativeDataset mapping. + """ + with session.Session() as sess: + for signature_def_key in repr_ds_map: + # Replaces the dataset with a new dataset where tf.Tensors are replaced + # by their evaluated values. + ds = repr_ds_map[signature_def_key] + repr_ds_map[signature_def_key] = rd.replace_tensors_by_numpy_ndarrays( + ds, sess + ) + + +def _create_sample_validator( + expected_input_keys: Collection[str], +) -> Callable[[rd.RepresentativeSample], rd.RepresentativeSample]: + """Creates a validator function for a representative sample. + + Args: + expected_input_keys: Input keys (keyword argument names) that the function + the sample will be used for is expecting to receive. + + Returns: + A callable that validates a `RepresentativeSample`. + """ + + def validator( + sample: rd.RepresentativeSample, + ) -> rd.RepresentativeSample: + """Validates a single instance of representative sample. + + This provides a simple check for `sample` that this is a mapping of + {input_key: input_value}. + + Args: + sample: A `RepresentativeSample` to validate. + + Returns: + `sample` iff it is valid. + + Raises: + ValueError: iff the sample isn't an instance of `Mapping`. + KeyError: iff the sample does not have the set of input keys that match + the input keys of the function. + """ + if not isinstance(sample, Mapping): + raise ValueError( + 'Invalid representative sample type. Provide a mapping ' + '(usually a dict) of {input_key: input_value}. ' + f'Got type: {type(sample)} instead.' + ) + + if set(sample.keys()) != expected_input_keys: + raise KeyError( + 'Invalid input keys for representative sample. The function expects ' + f'input keys of: {set(expected_input_keys)}. ' + f'Got: {set(sample.keys())}. Please provide correct input keys for ' + 'representative samples.' + ) + + return sample + + return validator + + +# TODO(b/249918070): Implement a progress bar. +def _log_sample_num_for_calibration( + representative_dataset: rd.RepresentativeDataset, +) -> rd.RepresentativeDataset: + """Logs the sample number for calibration. + + If in debug logging level, the "sample number / total num samples" is logged + for every 5 iterations. + + This is often useful when tracking the progress of the calibration step which + is often slow and may look stale if there's no logs being printed. + + Args: + representative_dataset: The representative dataset. + + Yields: + The representative samples from `representative_dataset` without any + modification. + """ + num_samples: Optional[int] = rd.get_num_samples(representative_dataset) + if num_samples is None: + total_num_samples = '?' + logging.info('Representative dataset size unknown.') + else: + total_num_samples = str(num_samples) + logging.info('Using representative dataset of size: %s', total_num_samples) + + sample_num = 0 + for sample in representative_dataset: + sample_num += 1 + + # Log the sample number for every 5 iterations. 
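# Note: absl's `logging.log_every_n(level, msg, n, *args)` emits `msg % args` only once per `n` calls, so the progress message below shows up for roughly every fifth sample rather than on every iteration (and only when DEBUG-level logging is enabled).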
+ logging.log_every_n( + logging.DEBUG, + 'Running representative sample for calibration: %d / %s', + 5, + sample_num, + total_num_samples, + ) + yield sample + + logging.info( + 'Running representative samples complete: %d / %s', + sample_num, + total_num_samples, + ) + + +def _run_function_for_calibration_graph_mode( + sess: session.Session, + signature_def: meta_graph_pb2.SignatureDef, + representative_dataset: rd.RepresentativeDataset, +) -> None: + """Runs the representative dataset through a function for calibration. + + NOTE: This is intended to be run in graph mode (TF1). + + The function is identified by the SignatureDef. + + Args: + sess: The Session object to run the function in. + signature_def: A SignatureDef that identifies a function by specifying the + inputs and outputs. + representative_dataset: The representative dataset to run through the + function. + """ + output_tensor_names = [ + output_tensor_info.name + for output_tensor_info in signature_def.outputs.values() + ] + + sample_validator = _create_sample_validator( + expected_input_keys=signature_def.inputs.keys() + ) + + for sample in map( + sample_validator, _log_sample_num_for_calibration(representative_dataset) + ): + # Create a mapping from input tensor name to the input tensor value. + # ex) "Placeholder:0" -> [0, 1, 2] + feed_dict = rd.create_feed_dict_from_input_data(sample, signature_def) + sess.run(output_tensor_names, feed_dict=feed_dict) + + +def _run_graph_for_calibration_graph_mode( + model_dir: str, + tags: Collection[str], + representative_dataset_map: rd.RepresentativeDatasetMapping, +) -> None: + """Runs the graph for calibration in graph mode. + + This function assumes _graph mode_ (used when legacy TF1 is used or when eager + mode is explicitly disabled) when running the graph. This step is used in + order to collect the statistics in CustomAggregatorOp for quantization using + the representative dataset for the actual data provided for inference. + + Args: + model_dir: Path to SavedModel directory. + tags: Collection of tags identifying the MetaGraphDef within the SavedModel. + representative_dataset_map: A map where signature keys are mapped to + corresponding representative datasets. + + Raises: + ValueError: When running the function with the representative dataset fails. + """ + # Replace tf.Tensors by numpy ndarrays in order to reuse the samples in a + # different graph when running the calibration. + _replace_tensors_by_numpy_ndarrays(representative_dataset_map) + + # Run the calibration in a new graph to avoid name collision, which could + # happen when the same model is loaded multiple times in the default graph. + with ops.Graph().as_default(), session.Session() as sess: + meta_graph: meta_graph_pb2.MetaGraphDef = loader_impl.load( + sess, tags, export_dir=model_dir + ) + + for signature_key, repr_ds in representative_dataset_map.items(): + sig_def = meta_graph.signature_def[signature_key] + + try: + _run_function_for_calibration_graph_mode( + sess, signature_def=sig_def, representative_dataset=repr_ds + ) + except Exception as ex: + raise ValueError( + 'Failed to run representative dataset through the ' + f'function with the signature key: {signature_key}.' + ) from ex + + +def _convert_values_to_tf_tensors( + sample: rd.RepresentativeSample, +) -> Mapping[str, core.Tensor]: + """Converts TensorLike values of `sample` to Tensors. + + Creates a copy of `sample`, where each value is converted to Tensors + unless it is already a Tensor. + The values are not converted in-place (i.e. 
`sample` is not mutated). + + Args: + sample: A representative sample, which is a map of {name -> tensorlike + value}. + + Returns: + Converted map of {name -> tensor}. + """ + tensor_mapping = {} + for name, tensorlike_value in sample.items(): + if isinstance(tensorlike_value, core.Tensor): + tensor_value = tensorlike_value + else: + tensor_value = tensor_conversion.convert_to_tensor_v2_with_dispatch( + tensorlike_value + ) + + tensor_mapping[name] = tensor_value + + return tensor_mapping + + +def _run_function_for_calibration_eager_mode( + func: wrap_function.WrappedFunction, + representative_dataset: rd.RepresentativeDataset, +) -> None: + """Runs the representative dataset through a function for calibration. + + NOTE: This is intended to be run in eager mode (TF2). + + Args: + func: The function to run the representative samples through. + representative_dataset: Representative dataset used for calibration. The + input keys and input values of the representative samples should match the + keyword arguments of `func`. + """ + _, keyword_args = func.structured_input_signature + sample_validator = _create_sample_validator( + expected_input_keys=keyword_args.keys() + ) + + for sample in map( + sample_validator, _log_sample_num_for_calibration(representative_dataset) + ): + # Convert any non-Tensor values from the sample to Tensors. + # This conversion is required because the model saved in `model_dir` is + # saved using TF1 SavedModelBuilder, which doesn't save the + # SavedObjectGraph. + # TODO(b/236795224): Remove the need for this conversion by keeping the + # FunctionSpec (object graph) in the SavedModel. Related: b/213406917. + func_kwargs = _convert_values_to_tf_tensors(sample) + func(**func_kwargs) + + +def _run_graph_for_calibration_eager_mode( + model_dir: str, + tags: Collection[str], + representative_dataset_map: rd.RepresentativeDatasetMapping, +) -> None: + """Runs the graph for calibration in eager mode. + + This function assumes _eager mode_ (enabled in TF2 by default) when running + the graph. This step is used in order to collect the statistics in + CustomAggregatorOp for quantization using the representative dataset for the + actual data provided for inference. + + Args: + model_dir: Path to SavedModel directory. + tags: Collection of tags identifying the MetaGraphDef within the SavedModel. + representative_dataset_map: A map where signature keys are mapped to + corresponding representative datasets. + + Raises: + ValueError: When running the function with the representative dataset fails. + """ + root: autotrackable.AutoTrackable = load.load(model_dir, tags) + for signature_key, repr_ds in representative_dataset_map.items(): + try: + _run_function_for_calibration_eager_mode( + func=root.signatures[signature_key], representative_dataset=repr_ds + ) + except Exception as ex: + raise ValueError( + 'Failed to run representative dataset through the ' + f'function with the signature key: {signature_key}.' + ) from ex + + +def _run_graph_for_calibration( + float_model_dir: str, + signature_keys: Sequence[str], + tags: Collection[str], + representative_dataset: rd.RepresentativeDatasetOrMapping, + force_graph_mode_calibration: bool, +) -> None: + """Runs the graph for calibration using representative datasets. + + Args: + float_model_dir: Path to the model to calibrate. + signature_keys: Sequence of keys identifying SignatureDef containing inputs + and outputs. + tags: Collection of tags identifying the MetaGraphDef within the SavedModel + to analyze. 
+ representative_dataset: An iterator that returns a dictionary of {input_key: + input_value} or a mapping from signature keys to such iterators. When + `signature_keys` contains more than one signature key, + `representative_dataset` should be a mapping that maps each signature key + to the corresponding representative dataset. + force_graph_mode_calibration: If set to true, it forces calibration in graph + mode instead of eager mode when the context is in eager mode. + + Raises: + ValueError iff: + * The representative dataset format is invalid. + * It fails to run the functions using the representative datasets. + """ + try: + _validate_representative_dataset(representative_dataset, signature_keys) + except Exception as ex: + raise ValueError('Invalid representative dataset.') from ex + + # If `representative_dataset` is not a mapping, convert to a mapping for the + # following functions to handle representative datasets more conveniently. + representative_dataset_map = representative_dataset + if not isinstance(representative_dataset, Mapping): + # `signature_keys` is guaranteed to have only one element after the + # validation. + representative_dataset_map = {signature_keys[0]: representative_dataset} + + try: + if context.executing_eagerly() and not force_graph_mode_calibration: + logging.info('Calibration step is executed in eager mode.') + _run_graph_for_calibration_eager_mode( + float_model_dir, tags, representative_dataset_map + ) + else: + logging.info('Calibration step is executed in graph mode.') + _run_graph_for_calibration_graph_mode( + float_model_dir, tags, representative_dataset_map + ) + except Exception as ex: + raise ValueError( + 'Failed to run graph for post-training quantization calibration.' + ) from ex + + logging.info('Calibration step complete.') + + +def _get_min_max_from_calibrator( + node_id: bytes, + calib_opts: quantization_options_pb2.CalibrationOptions, +) -> tuple[float, float]: + """Calculate min and max from statistics using calibration options. + + Args: + node_id: bytes of node id. + calib_opts: Calibration options used for calculating min and max. + + Returns: + (min_value, max_value): Min and max calculated using calib_opts. + + Raises: + ValueError: Unsupported calibration method is given. + """ + statistics: calibration_statistics_pb2.CalibrationStatistics = ( + pywrap_calibration.get_statistics_from_calibrator(node_id) + ) + min_value, max_value = calibration_algorithm.get_min_max_value( + statistics, calib_opts + ) + return min_value, max_value class PyFunctionLibrary(pywrap_function_lib.PyFunctionLibrary): @@ -26,27 +532,117 @@ class PyFunctionLibrary(pywrap_function_lib.PyFunctionLibrary): declared in `pywrap_function_lib.PyFunctionLibrary`. """ - def assign_ids_to_custom_aggregator_ops( + # LINT.IfChange(save_exported_model) + def save_exported_model( self, + dst_saved_model_path: str, exported_model_serialized: bytes, - ) -> bytes: - """Assigns UUIDs to each CustomAggregator op find in the graph def. + src_saved_model_path: str, + tags: set[str], + serialized_signature_def_map: dict[str, bytes], + ) -> None: + # LINT.ThenChange(py_function_lib.h:save_exported_model) + """Saves `ExportedModel` to `dst_saved_model_path` as a SavedModel. Args: - exported_model_serialized: Serialized `ExportedModel` instance. - - Returns: - Serialized `ExportedModel` whose CustomAggregator ops are assigned UUIDs - to their `id` attributes. + dst_saved_model_path: Destination path to save the exported model.
+ exported_model_serialized: Exported model to export as SavedModel. + src_saved_model_path: Path to the source SavedModel. This will be used to + copy the asset files to `dst_saved_model_path`. + tags: Tags to attach to the saved MetaGraphDef. + serialized_signature_def_map: Signature key -> serialized SignatureDef. """ exported_model = exported_model_pb2.ExportedModel.FromString( exported_model_serialized ) - graph_def = exported_model.graph_def - for function_def in graph_def.library.function: - for node_def in function_def.node_def: - if node_def.op == 'CustomAggregator': - node_def.attr['id'].s = uuid.uuid4().hex.encode('ascii') + # Deserialize values in signature_def_map. + signature_def_map = {} + for key, serialized_signature_def in serialized_signature_def_map.items(): + signature_def_map[key] = meta_graph_pb2.SignatureDef.FromString( + serialized_signature_def + ) + + save_model.save_model_v1( + exported_model.graph_def, + dst_saved_model_path, + signature_def_map, + tags, + init_op_name=exported_model.init_node_name, + saver_def=_get_saver_def_or_none(exported_model), + checkpoint_dir=exported_model.checkpoint_dir, + function_aliases=exported_model.function_aliases, + asset_file_defs=exported_model.asset_file_defs, + ) + + _copy_assets(src_saved_model_path, dst_saved_model_path) - return exported_model.SerializeToString() + # TODO: b/311097139 - Extract calibration related functions into a separate + # file. + # LINT.IfChange(run_calibration) + def run_calibration( + self, + saved_model_path: str, + signature_keys: list[str], + tags: set[str], + calibration_options_serialized: bytes, + force_graph_mode_calibration: bool, + representative_dataset: rd.RepresentativeDatasetOrMapping, + ) -> None: + # LINT.ThenChange(py_function_lib.h:run_calibration) + """Runs calibration and collects calibration statistics for the model. + + Args: + saved_model_path: Path to the SavedModel to run calibration. + signature_keys: List of signature keys corresponding to SignatureDefs to + run calibration on. + tags: A set of tags that identify the MetaGraphDef. + calibration_options_serialized: Serialized `CalibrationOptions`. + force_graph_mode_calibration: If True, runs the calibration in graph mode. + representative_dataset: Representative dataset to run calibration. + + The collected min and max values are kept in a global + `CalibratorSingleton` instance and are attached to the `CustomAggregator` + nodes of the exported model in a later step; this method returns `None`. + """ + # Uses the representative dataset to collect statistics for calibration. + # After this operation, min & max values are stored separately in a global + # CalibratorSingleton instance. + _run_graph_for_calibration( + saved_model_path, + signature_keys, + tags, + representative_dataset, + force_graph_mode_calibration, + ) + + # LINT.IfChange(get_calibration_min_max_value) + def get_calibration_min_max_value( + self, + calibration_statistics_serialized: bytes, + calibration_options_serialized: bytes, + ) -> tuple[float, float]: + """Calculates min and max values from statistics. + + Args: + calibration_statistics_serialized: Serialized `CalibrationStatistics`. + This will be the source to calculate min and max values from. + calibration_options_serialized: Serialized `CalibrationOptions`. Specifies + how the min / max should be calculated. + + Returns: + (min_value, max_value): Min and max calculated using calib_opts. + + Raises: + ValueError: Unsupported calibration method is given.
+ """ + # LINT.ThenChange(py_function_lib.h:get_calibration_min_max_value) + return calibration_algorithm.get_min_max_value( + calibration_statistics_pb2.CalibrationStatistics.FromString( + calibration_statistics_serialized + ), + quantization_options_pb2.CalibrationOptions.FromString( + calibration_options_serialized + ), + ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib_test.py index fbac4dad0454de..b170daca109e98 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib_test.py @@ -13,45 +13,13 @@ # limitations under the License. # ============================================================================== """Tests for py_function_lib.""" -from tensorflow.compiler.mlir.quantization.tensorflow import exported_model_pb2 -from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib -from tensorflow.core.framework import function_pb2 -from tensorflow.core.framework import node_def_pb2 from tensorflow.python.platform import test class PyFunctionLibTest(test.TestCase): - - def test_assign_ids_to_custom_aggregator_ops(self): - func_lib = py_function_lib.PyFunctionLibrary() - exported_model = exported_model_pb2.ExportedModel() - function_def: function_pb2.FunctionDef = ( - exported_model.graph_def.library.function.add() - ) - - node_def_1: node_def_pb2.NodeDef = function_def.node_def.add() - node_def_1.op = 'CustomAggregator' - - node_def_2: node_def_pb2.NodeDef = function_def.node_def.add() - node_def_2.op = 'Identity' - - result_exported_model = exported_model_pb2.ExportedModel.FromString( - func_lib.assign_ids_to_custom_aggregator_ops( - exported_model.SerializeToString() - ) - ) - result_function_def = result_exported_model.graph_def.library.function[0] - - # Check that a 'CustomAggregatorOp' has an 'id' attribute whereas other ops - # don't. - result_node_def_1 = result_function_def.node_def[0] - self.assertEqual(result_node_def_1.op, 'CustomAggregator') - self.assertIn('id', result_node_def_1.attr) - self.assertLen(result_node_def_1.attr, 1) - - result_node_def_2 = result_function_def.node_def[1] - self.assertEqual(result_node_def_2.op, 'Identity') - self.assertNotIn('id', result_node_def_2.attr) + # Functions in PyFunctionLib is in the process of migration to c++ + # implementations. + pass if __name__ == '__main__': diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc index 4b84bca54b71b9..3e14a9bd1e8b73 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.cc @@ -12,15 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "pybind11/cast.h" // from @pybind11 #include "pybind11/detail/common.h" // from @pybind11 #include "pybind11/pybind11.h" // from @pybind11 +#include "pybind11/pytypes.h" // from @pybind11 +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h" -#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace py = ::pybind11; namespace { +using ::stablehlo::quantization::MinMaxValue; +using ::tensorflow::SignatureDef; +using ::tensorflow::calibrator::CalibrationStatistics; +using ::tensorflow::quantization::CalibrationOptions; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PyFunctionLibrary; @@ -33,10 +51,35 @@ class PyFunctionLibraryTrampoline : public PyFunctionLibrary { public: using PyFunctionLibrary::PyFunctionLibrary; - ExportedModel AssignIdsToCustomAggregatorOps( - const ExportedModel& exported_model) const override { - PYBIND11_OVERRIDE_PURE(ExportedModel, PyFunctionLibrary, - assign_ids_to_custom_aggregator_ops, exported_model); + void SaveExportedModel(const absl::string_view dst_saved_model_path, + const ExportedModel& exported_model, + const absl::string_view src_saved_model_path, + const std::unordered_set& tags, + const absl::flat_hash_map& + signature_def_map) const override { + PYBIND11_OVERRIDE_PURE(void, PyFunctionLibrary, save_exported_model, + dst_saved_model_path, exported_model, + src_saved_model_path, tags, signature_def_map); + } + + void RunCalibration(const absl::string_view saved_model_path, + const std::vector& signature_keys, + const std::unordered_set& tags, + const CalibrationOptions& calibration_options, + const bool force_graph_mode_calibration, + const py::object representative_dataset) const override { + PYBIND11_OVERRIDE_PURE(void, PyFunctionLibrary, run_calibration, + saved_model_path, signature_keys, tags, + calibration_options, force_graph_mode_calibration, + representative_dataset); + } + + MinMaxValue GetCalibrationMinMaxValue( + const CalibrationStatistics& calibration_statistics, + const CalibrationOptions& calibration_options) const override { + PYBIND11_OVERRIDE_PURE(MinMaxValue, PyFunctionLibrary, + get_calibration_min_max_value, + calibration_statistics, calibration_options); } }; @@ -46,6 +89,18 @@ PYBIND11_MODULE(pywrap_function_lib, m) { py::class_( m, "PyFunctionLibrary") .def(py::init<>()) - .def("assign_ids_to_custom_aggregator_ops", - &PyFunctionLibrary::AssignIdsToCustomAggregatorOps); + .def("save_exported_model", &PyFunctionLibrary::SaveExportedModel, + py::arg("dst_saved_model_path"), + py::arg("exported_model_serialized"), + py::arg("src_saved_model_path"), py::arg("tags"), + py::arg("serialized_signature_def_map")) + 
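      // The methods bound in this module are implemented by the Python
      // subclass in py_function_lib.py. An illustrative Python-side call of
      // the binding above, mirroring pywrap_function_lib.pyi (the paths and
      // values here are hypothetical):
      //   lib = py_function_lib.PyFunctionLibrary()
      //   lib.save_exported_model('/tmp/dst_model',
      //                           exported_model.SerializeToString(),
      //                           '/tmp/src_model', {'serve'},
      //                           {'serving_default': sig_def.SerializeToString()})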
.def("run_calibration", &PyFunctionLibrary::RunCalibration, + py::arg("saved_model_path"), py::arg("signature_keys"), + py::arg("tags"), py::arg("calibration_options_serialized"), + py::arg("force_graph_mode_calibration"), + py::arg("representative_dataset")) + .def("get_calibration_min_max_value", + &PyFunctionLibrary::GetCalibrationMinMaxValue, + py::arg("calibration_statistics_serialized"), + py::arg("calibration_options_serialized")); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi index 4c1c8937e8d38b..55c7a4fb346a70 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.pyi @@ -12,7 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +from typing import Any + class PyFunctionLibrary: - def assign_ids_to_custom_aggregator_ops( - self, exported_model_serialized: bytes - ) -> bytes: ... + + # LINT.IfChange(save_exported_model) + def save_exported_model( + self, + dst_saved_model_path: str, + exported_model_serialized: bytes, + src_saved_model_path: str, + tags: set[str], + serialized_signature_def_map: dict[str, bytes], + ) -> None: ... + # LINT.ThenChange() + + # LINT.IfChange(run_calibration) + def run_calibration( + self, + saved_model_path: str, + signature_keys: list[str], + tags: set[str], + calibration_options_serialized: bytes, + force_graph_mode_calibration: bool, + representative_dataset: Any, + ) -> None: ... + # LINT.ThenChange() + + # LINT.IfChange(get_calibration_min_max_value) + def get_calibration_min_max_value( + self, + calibration_statistics_serialized: bytes, + calibration_options_serialized: bytes, + ) -> tuple[float, float]: ... + # LINT.ThenChange() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc index 05eb0123589c0a..43eebfd53468d8 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include #include #include #include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" +#include "absl/status/status.h" #include "absl/status/statusor.h" -#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "pybind11/cast.h" // from @pybind11 #include "pybind11/detail/common.h" // from @pybind11 @@ -30,19 +30,27 @@ limitations under the License. 
#include "pybind11_abseil/import_status_module.h" // from @pybind11_abseil #include "pybind11_abseil/status_casters.h" // from @pybind11_abseil // IWYU pragma: keep #include "pybind11_protobuf/native_proto_caster.h" // from @pybind11_protobuf -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" -#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace py = pybind11; namespace { -using ::tensorflow::calibrator::CalibrationStatistics; -using ::tensorflow::calibrator::CalibratorSingleton; +using ::stablehlo::quantization::AddCalibrationStatistics; +using ::stablehlo::quantization::AssignIdsToCustomAggregatorOps; +using ::stablehlo::quantization::EnableDebugging; +using ::stablehlo::quantization::io::CreateTmpDir; +using ::tensorflow::SignatureDef; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PyFunctionLibrary; using ::tensorflow::quantization::QuantizationOptions; @@ -52,157 +60,283 @@ using ::tensorflow::quantization::QuantizePtqModelPreCalibration; using ::tensorflow::quantization::QuantizeQatModel; using ::tensorflow::quantization::QuantizeWeightOnly; -// Retrieves collected statistics of a `CustomAggregator` node from the -// singleton. `id` is the identifier of the `CustomAggregator`. -CalibrationStatistics GetStatisticsFromCalibrator(const absl::string_view id) { - std::optional statistics = - CalibratorSingleton::GetStatistics(id); - - if (!statistics.has_value()) { - throw py::value_error(absl::StrFormat( - "Calibrated data does not exist. Cannot find statistics." - "value for id: '%s'", - id)); - } - - return *statistics; -} - } // namespace PYBIND11_MODULE(pywrap_quantize_model, m) { // Supports absl::StatusOr type conversions. pybind11::google::ImportStatusModule(); - // TODO - b/308532051: Make protobuf objects work without serialization - // overhead. pybind11_protobuf::ImportNativeProtoCasters(); - // Calibrator related functions. - m.def( - "clear_calibrator", - [] { CalibratorSingleton::ClearCollectedInformation(); }, - R"pbdoc( - Clears the collected metrics from the calibrator. - )pbdoc"); - m.def( - "clear_data_from_calibrator", - [](const absl::string_view id) { CalibratorSingleton::ClearData(id); }, - R"pbdoc( - Clears the collected data of the given id from calibrator. - )pbdoc"); - m.def( - "get_statistics_from_calibrator", - [](const absl::string_view id) -> CalibrationStatistics { - return GetStatisticsFromCalibrator(id); - }, - R"pbdoc( - Returns the proto CalibrationStatistics given id from calibrator. 
- )pbdoc"); - - // Quantization functions. m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. + // LINT.IfChange "quantize_qat_model", - [](const absl::string_view saved_model_path, + [](const absl::string_view src_saved_model_path, + const absl::string_view dst_saved_model_path, + const QuantizationOptions& quantization_options, const std::vector& signature_keys, - const std::unordered_set& tags, - const QuantizationOptions& quant_opts, - const absl::flat_hash_map& function_aliases) - -> absl::StatusOr { - return QuantizeQatModel(saved_model_path, signature_keys, tags, - quant_opts, function_aliases); + const absl::flat_hash_map& + signature_def_map, + const absl::flat_hash_map& function_aliases, + const PyFunctionLibrary& py_function_library) -> absl::Status { + // LINT.ThenChange(pywrap_quantize_model.pyi:quantize_qat_model) + std::unordered_set tags; + tags.insert(quantization_options.tags().begin(), + quantization_options.tags().end()); + const absl::StatusOr exported_model = + QuantizeQatModel(src_saved_model_path, signature_keys, tags, + quantization_options, function_aliases); + if (!exported_model.ok()) return exported_model.status(); + + // Remove the `tpu` tag from the debug quantized saved model as it is + // for CPU. Note the 'tpu' value should be the same as `TPU` defined in + // tensorflow/python/saved_model/tag_constants.py. + if (quantization_options.has_debugger_options()) { + tags.erase("tpu"); + } + py_function_library.SaveExportedModel( + dst_saved_model_path, *exported_model, src_saved_model_path, tags, + signature_def_map); + + return absl::OkStatus(); }, R"pbdoc( - Returns serialized ExportedModel that contains the quantized model's - GraphDef and metadata. The user should pass a serialized - `QuantizationOptions` for the `quant_opts` argument. + Quantizes a model that went through quantization-aware training (QAT) + saved at `src_saved_model_path`. The resulting model will be saved to + `dst_saved_model_path`. Returns an OK sataus when successful, otherwise + raises `StatusNotOk` exception. - Raises `StatusNotOk` exception if when the run was unsuccessful. - )pbdoc"); + The user should pass a serialized `QuantizationOptions` for the + `quantization_options_serialized` argument, and a signature key -> + serialized `SignatureDef` mapping for the `signature_def_map_serialized` + argument. + + `function_aliases` maps actual function names to the function aliases, as + defined by the `MetaGraphDef::MetaInfoDef::function_aliases` from the + input SavedModel. + )pbdoc", + py::arg("src_saved_model_path"), py::arg("dst_saved_model_path"), + py::arg("quantization_options_serialized"), py::kw_only(), + py::arg("signature_keys"), py::arg("signature_def_map_serialized"), + py::arg("function_aliases"), py::arg("py_function_library")); m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. 
+ // LINT.IfChange "quantize_ptq_dynamic_range", - [](const absl::string_view saved_model_path, + [](const absl::string_view src_saved_model_path, + const absl::string_view dst_saved_model_path, + const QuantizationOptions& quantization_options, const std::vector& signature_keys, - const std::unordered_set& tags, - const QuantizationOptions& quant_opts, - const absl::flat_hash_map& function_aliases) - -> absl::StatusOr { - return QuantizePtqDynamicRange(saved_model_path, signature_keys, tags, - quant_opts, function_aliases); + const absl::flat_hash_map& + signature_def_map, + const absl::flat_hash_map& function_aliases, + const PyFunctionLibrary& py_function_library) -> absl::Status { + // LINT.ThenChange(pywrap_quantize_model.pyi:quantize_ptq_dynamic_range) + std::unordered_set tags; + tags.insert(quantization_options.tags().begin(), + quantization_options.tags().end()); + + const absl::StatusOr exported_model = + QuantizePtqDynamicRange(src_saved_model_path, signature_keys, tags, + quantization_options, function_aliases); + + // Remove the `tpu` tag from the debug quantized saved model as it is + // for CPU. Note the 'tpu' value should be the same as `TPU` defined in + // tensorflow/python/saved_model/tag_constants.py. + if (quantization_options.has_debugger_options()) { + tags.erase("tpu"); + } + py_function_library.SaveExportedModel( + dst_saved_model_path, *exported_model, src_saved_model_path, tags, + signature_def_map); + + return absl::OkStatus(); }, R"pbdoc( - Returns serialized ExportedModel that contains the quantized model's - GraphDef and metadata. The user should pass a serialized - `QuantizationOptions` for the `quant_opts` argument. + Quantizes a model saved at `src_saved_model_path` using dynamic-range + quantization algorithm. The resulting model will be saved to + `dst_saved_model_path`. Returns an OK sataus when successful, otherwise + raises `StatusNotOk` exception. - Raises `StatusNotOk` exception if when the run was unsuccessful. - )pbdoc"); + The user should pass a serialized `QuantizationOptions` for the + `quantization_options_serialized` argument, and a signature key -> + serialized `SignatureDef` mapping for the `signature_def_map_serialized` + argument. + + `function_aliases` maps actual function names to the function aliases, as + defined by the `MetaGraphDef::MetaInfoDef::function_aliases` from the + input SavedModel. + )pbdoc", + py::arg("src_saved_model_path"), py::arg("dst_saved_model_path"), + py::arg("quantization_options_serialized"), py::kw_only(), + py::arg("signature_keys"), py::arg("signature_def_map_serialized"), + py::arg("function_aliases"), py::arg("py_function_library")); m.def( + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. 
+ // LINT.IfChange "quantize_weight_only", - [](const absl::string_view saved_model_path, - const QuantizationOptions& quant_opts, - const absl::flat_hash_map& function_aliases) - -> absl::StatusOr { - return QuantizeWeightOnly(saved_model_path, quant_opts, - function_aliases); + [](const absl::string_view src_saved_model_path, + const absl::string_view dst_saved_model_path, + const QuantizationOptions& quantization_options, + const absl::flat_hash_map& + signature_def_map, + const absl::flat_hash_map& function_aliases, + const PyFunctionLibrary& py_function_library) -> absl::Status { + // LINT.ThenChange(pywrap_quantize_model.pyi:quantize_weight_only) + const absl::StatusOr exported_model = QuantizeWeightOnly( + src_saved_model_path, quantization_options, function_aliases); + if (!exported_model.ok()) return exported_model.status(); + + std::unordered_set tags; + tags.insert(quantization_options.tags().begin(), + quantization_options.tags().end()); + + py_function_library.SaveExportedModel( + dst_saved_model_path, *exported_model, src_saved_model_path, tags, + signature_def_map); + + return absl::OkStatus(); }, R"pbdoc( - Returns serialized ExportedModel that contains the quantized model's - GraphDef and metadata. The user should pass a serialized - `QuantizationOptions` for the `quant_opts` argument. + Quantizes a model saved at `src_saved_model_path` using weight-only + quantization algorithm. The resulting model will be saved to + `dst_saved_model_path`. Returns an OK sataus when successful, otherwise + raises `StatusNotOk` exception. - Raises `StatusNotOk` exception if when the run was unsuccessful. - )pbdoc"); + The user should pass a serialized `QuantizationOptions` for the + `quantization_options_serialized` argument, and a signature key -> + serialized `SignatureDef` mapping for the `signature_def_map_serialized` + argument. + + `function_aliases` maps actual function names to the function aliases, as + defined by the `MetaGraphDef::MetaInfoDef::function_aliases` from the + input SavedModel. + )pbdoc", + py::arg("src_saved_model_path"), py::arg("dst_saved_model_path"), + py::arg("quantization_options_serialized"), py::kw_only(), + py::arg("signature_def_map_serialized"), py::arg("function_aliases"), + py::arg("py_function_library")); m.def( - "quantize_ptq_model_pre_calibration", - [](const absl::string_view saved_model_path, + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. 
+ // LINT.IfChange + "quantize_ptq_static_range", + [](const absl::string_view src_saved_model_path, + const absl::string_view dst_saved_model_path, + const QuantizationOptions& quantization_options, const std::vector& signature_keys, - const std::unordered_set& tags, - const QuantizationOptions& quant_opts, + const absl::flat_hash_map& + signature_def_map, const absl::flat_hash_map& function_aliases, - const PyFunctionLibrary& py_function_lib) - -> absl::StatusOr { - const absl::StatusOr exported_model = - QuantizePtqModelPreCalibration(saved_model_path, signature_keys, - tags, quant_opts, function_aliases); - if (!exported_model.ok()) { - return exported_model.status(); + const PyFunctionLibrary& py_function_library, + py::object representative_dataset) -> absl::Status { + // LINT.ThenChange(pywrap_quantize_model.pyi:quantize_ptq_model_static_range) + std::unordered_set tags; + tags.insert(quantization_options.tags().begin(), + quantization_options.tags().end()); + + absl::StatusOr exported_model = + QuantizePtqModelPreCalibration(src_saved_model_path, signature_keys, + tags, quantization_options, + function_aliases); + if (!exported_model.ok()) return exported_model.status(); + + AssignIdsToCustomAggregatorOps(*exported_model->mutable_graph_def()); + + const absl::StatusOr precalibrated_saved_model_dir = + CreateTmpDir(); + if (!precalibrated_saved_model_dir.ok()) { + throw py::value_error( + precalibrated_saved_model_dir.status().ToString()); } - return py_function_lib.AssignIdsToCustomAggregatorOps(*exported_model); - }, - R"pbdoc( - Returns serialized ExportedModel that contains the model's GraphDef and - metadata. The GraphDef contains extra ops required for calibration. The - user should pass a serialized `QuantizationOptions` for the `quant_opts` - argument. + py_function_library.SaveExportedModel( + *precalibrated_saved_model_dir, *exported_model, + src_saved_model_path, tags, signature_def_map); - The argument `custom_aggregator_id_assigner` is an instance of - `CustomAggregatorIdAssigner` whose virtual function `assign_ids` is - implemented in python. + py_function_library.RunCalibration( + *precalibrated_saved_model_dir, signature_keys, tags, + quantization_options.calibration_options(), + quantization_options.force_graph_mode_calibration(), + representative_dataset); - Raises `StatusNotOk` exception if when the run was unsuccessful. - )pbdoc"); + if (absl::Status status = AddCalibrationStatistics( + *exported_model->mutable_graph_def(), + quantization_options.calibration_options(), + py_function_library); + !status.ok()) { + LOG(WARNING) << "Some CustomAggregator ops do not have min or max " + "values. Parts of the graph are not quantized. 
" + << status; + } - m.def( - "quantize_ptq_model_post_calibration", - [](const absl::string_view saved_model_path, - const std::vector& signature_keys, - const std::unordered_set& tags, - const QuantizationOptions& quant_opts, - const absl::flat_hash_map& function_aliases) - -> absl::StatusOr { - return QuantizePtqModelPostCalibration(saved_model_path, signature_keys, - tags, quant_opts, - function_aliases); + if (quantization_options.has_debugger_options()) { + EnableDebugging(*exported_model, + quantization_options.debugger_options(), + py_function_library, src_saved_model_path, tags, + signature_def_map); + } + + const absl::StatusOr calibrated_saved_model_path = + CreateTmpDir(); + if (!calibrated_saved_model_path.ok()) { + throw py::value_error( + calibrated_saved_model_path.status().ToString()); + } + + py_function_library.SaveExportedModel( + *calibrated_saved_model_path, *exported_model, src_saved_model_path, + tags, signature_def_map); + + const absl::flat_hash_map + function_aliases_after_calibration( + exported_model->function_aliases().begin(), + exported_model->function_aliases().end()); + + const absl::StatusOr post_calibrated_exported_model = + QuantizePtqModelPostCalibration( + *calibrated_saved_model_path, signature_keys, tags, + quantization_options, function_aliases_after_calibration); + if (!post_calibrated_exported_model.ok()) + return post_calibrated_exported_model.status(); + + // Remove the `tpu` tag from the debug quantized saved model as it is + // for CPU. Note the 'tpu' value should be the same as `TPU` defined in + // tensorflow/python/saved_model/tag_constants.py. + if (quantization_options.has_debugger_options()) { + tags.erase("tpu"); + } + py_function_library.SaveExportedModel( + dst_saved_model_path, *post_calibrated_exported_model, + *calibrated_saved_model_path, tags, signature_def_map); + + return absl::OkStatus(); }, R"pbdoc( - Returns serialized ExportedModel that contains the quantized model's - GraphDef and metadata. The user should pass a serialized - `QuantizationOptions` for the `quant_opts` argument. + Runs static-range post-training quantization (PTQ) on a SavedModel at + `src_saved_model_path` and saves the resulting model to + `dst_saved_model_path`. + + The user should pass a serialized `QuantizationOptions` for the + `quantization_options_serialized` argument, and a signature key -> + serialized `SignatureDef` mapping for the `signature_def_map_serialized` + argument. + + `function_aliases` maps actual function names to the function aliases, as + defined by the `MetaGraphDef::MetaInfoDef::function_aliases` from the + input SavedModel. Raises `StatusNotOk` exception if when the run was unsuccessful. - )pbdoc"); + )pbdoc", + py::arg("saved_model_path"), py::arg("dst_saved_model_path"), + py::arg("quantization_options_serialized"), py::kw_only(), + py::arg("signature_keys"), py::arg("signature_def_map_serialized"), + py::arg("function_aliases"), py::arg("py_function_library"), + py::arg("representative_dataset")); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.pyi b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.pyi index 6e47f029f5e4d9..afe61d54854e71 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.pyi +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.pyi @@ -12,45 +12,64 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +from typing import Any + from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib +from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as rd -def clear_calibrator() -> None: ... -def clear_data_from_calibrator(id: bytes) -> None: ... -def get_statistics_from_calibrator( - id: bytes, -) -> calibration_statistics_pb2.CalibrationStatistics: ... +# LINT.IfChange(quantize_qat_model) def quantize_qat_model( - saved_model_path: str, - signature_keys: list[str], - tags: set[str], + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options_serialized: bytes, + *, + signature_keys: list[str], + signature_def_map_serialized: dict[str, bytes], function_aliases: dict[str, str], -) -> bytes: ... + py_function_library: py_function_lib.PyFunctionLibrary, +) -> Any: ... # Status + +# LINT.ThenChange() + +# LINT.IfChange(quantize_ptq_dynamic_range) def quantize_ptq_dynamic_range( - saved_model_path: str, - signature_keys: list[str], - tags: set[str], + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options_serialized: bytes, + *, + signature_keys: list[str], + signature_def_map_serialized: dict[str, bytes], function_aliases: dict[str, str], -) -> bytes: ... + py_function_library: py_function_lib.PyFunctionLibrary, +) -> Any: ... # Status + +# LINT.ThenChange() + +# LINT.IfChange(quantize_weight_only) def quantize_weight_only( - saved_model_path: str, - quantization_options_serialized: bytes, - function_aliases: dict[str, str], -) -> bytes: ... -def quantize_ptq_model_pre_calibration( - saved_model_path: str, - signature_keys: list[str], - tags: set[str], + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options_serialized: bytes, + *, + signature_def_map_serialized: dict[str, bytes], function_aliases: dict[str, str], py_function_library: py_function_lib.PyFunctionLibrary, -) -> bytes: ... -def quantize_ptq_model_post_calibration( - saved_model_path: str, - signature_keys: list[str], - tags: set[str], +) -> Any: ... # Status + +# LINT.ThenChange() + +# LINT.IfChange(quantize_ptq_static_range) +def quantize_ptq_static_range( + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options_serialized: bytes, + *, + signature_keys: list[str], + signature_def_map_serialized: dict[str, bytes], function_aliases: dict[str, str], -) -> bytes: ... + py_function_library: py_function_lib.PyFunctionLibrary, + representative_dataset: rd.RepresentativeDatasetOrMapping, +) -> Any: ... # Status + +# LINT.ThenChange() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model_test.py index ed531218290c7b..b29edcfaed4c9c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model_test.py @@ -17,6 +17,7 @@ These test cases are mostly for validation checks. Tests for functionalities are at `quantize_model_test.py`. 
""" +from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib from tensorflow.compiler.mlir.quantization.tensorflow.python import pywrap_quantize_model from tensorflow.python.platform import test @@ -25,25 +26,39 @@ class PywrapQuantizeModelTest(test.TestCase): """Test cases for quantize_model python wrappers.""" def test_quantize_model_fails_when_invalid_quant_options_serialization(self): - saved_model_path = self.create_tempdir('saved_model').full_path + src_saved_model_path = self.create_tempdir().full_path + dst_saved_model_path = self.create_tempdir().full_path signature_def_keys = ['serving_default'] - tags = {'serve'} - quant_opts_serialized = 'invalid protobuf serialization string' + quant_opts_serialized = 'invalid proto serialization string'.encode('utf-8') with self.assertRaisesRegex(TypeError, 'incompatible function arguments'): - pywrap_quantize_model.quantize_ptq_model_pre_calibration( - saved_model_path, signature_def_keys, tags, quant_opts_serialized + pywrap_quantize_model.quantize_ptq_static_range( + src_saved_model_path, + dst_saved_model_path, + quant_opts_serialized, + signature_keys=signature_def_keys, + signature_def_map_serialized={}, + function_aliases={}, + py_function_library=py_function_lib.PyFunctionLibrary(), + representative_dataset=None, ) def test_quantize_model_fails_when_invalid_quant_options_type(self): - saved_model_path = self.create_tempdir('saved_model').full_path + src_saved_model_path = self.create_tempdir().full_path + dst_saved_model_path = self.create_tempdir().full_path signature_def_keys = ['serving_default'] - tags = {'serve'} invalid_quant_opts_object = ('a', 'b', 'c') with self.assertRaisesRegex(TypeError, 'incompatible function arguments'): - pywrap_quantize_model.quantize_ptq_model_pre_calibration( - saved_model_path, signature_def_keys, tags, invalid_quant_opts_object + pywrap_quantize_model.quantize_ptq_static_range( + src_saved_model_path, + dst_saved_model_path, + invalid_quant_opts_object, + signature_keys=signature_def_keys, + signature_def_map_serialized={}, + function_aliases={}, + py_function_library=py_function_lib.PyFunctionLibrary(), + representative_dataset=None, ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 4054ce5ab6f354..ab4f3327956cd0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -22,10 +22,14 @@ limitations under the License. #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "llvm/ADT/SmallVector.h" @@ -39,20 +43,22 @@ limitations under the License. 
#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/export.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/precalibration.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/status_macro.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" @@ -63,8 +69,7 @@ limitations under the License. #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saver.pb.h" -#include "tsl/platform/env.h" -#include "tsl/platform/status.h" +#include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" namespace tensorflow { @@ -73,66 +78,16 @@ namespace { using ::mlir::quant::kTfFilePrefix; using ::mlir::quant::kTfQuantSaveOpName; +using ::mlir::quant::stablehlo::PreCalibrationComponent; using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; using ::mlir::tf_saved_model::kTfSavedModelInitializerInitType; using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; - -// Suffix string for the module export step. Used for debugging. -constexpr absl::string_view kExportStepSuffix = "_export"; - -// Options when running passes for exporting an MLIR ModuleOp. -struct ExportOptions { - // If set to `true`, it runs `DuplicateShapeDeterminingConstantsPass` before - // lowering to tf_executor dialect. - bool duplicate_shape_determining_constants = true; - - // If set to `true`, unfreezes constants into variables and saves them to a - // checkpoint file. Setting this to `true` is an experimental feature that has - // no stability guarantees. - bool unfreeze_constants = false; - - // Path to the directory where checkpoint files are saved. - std::string checkpoint_dir = ""; - - // Name used to identify the ModuleOp this is exporting. Only used for - // debugging and does not modify the behavior of the export. 
- std::string debug_name = "tf_quant"; -}; - -// Add passes for transforming the MLIR module op so that it can be exported -// back to GraphDef. Roughly, this consists of: -// 1) Inserting the @main function, which will become the main Graph. -// 2) Duplicating shape-determining constants. -// 3) Converting TF dialect -> tf_executor dialect. -// 4) Adding initializer function's ops into @main function for correct -// resource initialization when loading the exported model. -// -// Duplicating shape-determining constants is required to place constants that -// affect the shape of a tensor to be placed in the TPU graph instead of in the -// CPU graph, when the graph gets converted for TPU inference. This allows these -// constants to be known at XLA compilation time. -void AddExportPasses(const bool duplicate_shape_determining_constants, - mlir::PassManager &pm) { - if (duplicate_shape_determining_constants) { - pm.addNestedPass( - mlir::quant::CreateDuplicateShapeDeterminingConstantsPass()); - } - - pm.addPass(mlir::quant::CreateInsertMainFunctionPass()); - pm.addPass(mlir::quant::CreateLiftHashTableOpsAsArgsPass()); - pm.addNestedPass( - mlir::CreateFunctionalToExecutorDialectConversionPass()); - pm.addPass(mlir::CreateBreakUpIslandsPass()); - pm.addPass(mlir::quant::CreateMergeInitializerFunctionOpsToMainPass()); - pm.addPass(mlir::quant::CreateMergeSaveFunctionOpsToMainPass()); - pm.addNestedPass( - mlir::quant::CreateMergeDuplicateResourceOpsPass()); - - // Used to clean up the "tf._noinliner" attribute that is previously used to - // prevent certain functions from being inlined (see - // `MarkFunctionsNoinlinePass`). InlinerPass must not come after this pass. - pm.addPass(mlir::TF::CreateStripNoinlineAttributePass()); -} +using ::stablehlo::quantization::AddExportPasses; +using ::stablehlo::quantization::CreateExportedModel; +using ::stablehlo::quantization::ExportOptions; +using ::stablehlo::quantization::kExportStepSuffix; +using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::io::GetLocalTmpFileName; // Finds and returns the name of the node from a set of control output nodes. // The name should contain the string `contains`. Returns an empty string if no @@ -151,32 +106,6 @@ std::string GetNodeName(const absl::flat_hash_set &control_ret_nodes, return ""; } -// Factory function for `ExportedModel`. -[[nodiscard]] ExportedModel CreateExportedModel( - GraphDef &&graph_def, const absl::string_view init_node_name, - const absl::string_view checkpoint_dir, - const std::optional saver_def, - const absl::flat_hash_map &function_aliases, - const std::vector &asset_file_defs) { - ExportedModel exported_model{}; - *exported_model.mutable_graph_def() = graph_def; - exported_model.set_init_node_name(std::string(init_node_name)); - exported_model.set_checkpoint_dir(std::string(checkpoint_dir)); - - exported_model.mutable_function_aliases()->insert(function_aliases.begin(), - function_aliases.end()); - - for (const auto &asset_file_def : asset_file_defs) { - *exported_model.mutable_asset_file_defs()->Add() = asset_file_def; - } - - if (saver_def != std::nullopt) { - *exported_model.mutable_saver_def() = *std::move(saver_def); - } - - return exported_model; -} - // Returns the file prefix tensor name. An empty string is returned if no such a // tensor is found (when there are no variables to restore, it is expected that // the file prefix tensor does not exist). 
The file prefix tensor is found among @@ -197,7 +126,7 @@ std::string FindFilePrefixTensorName(const GraphDef &graph_def) { if (const auto file_prefix_itr = absl::c_find(index_paths, kTfFilePrefix.str()); file_prefix_itr != index_paths.end()) { - // ":0" appended to inidicate that it is a tensor, not an Operation. + // ":0" appended to indicate that it is a tensor, not an Operation. return absl::StrCat(node_def.name(), ":0"); } } @@ -322,60 +251,6 @@ absl::flat_hash_map UpdateFunctionAliases( return updated_function_aliases; } -// Create a unique local temporary filename. It only creates the name, not the -// actual file. -absl::StatusOr GetLocalTempFilename() { - auto *env = Env::Default(); - std::string tmp_fname{}; - if (!env->LocalTempFilename(&tmp_fname)) { - return absl::InternalError("Failed to create a local temp file name."); - } - - return tmp_fname; -} - -// Unfreezes constants into variables and saves them to a checkpoint files under -// `checkpoint_dir`. `checkpoint_dir` will be created within this function. It -// will return a non-OK status if it already exists or permission is denied. -// TODO(b/261652258): Make sure this works for when there are non-frozen -// variables in the model. -// TODO(b/262189534): Move this to a separate file for better testing. -absl::Status UnfreezeConstantsAndSaveVariables( - const absl::string_view checkpoint_dir, mlir::MLIRContext &ctx, - mlir::ModuleOp module_op) { - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantConstantUnfreezingStepName, - /*add_passes_func=*/ - [](mlir::PassManager &pm) { - pm.addPass(mlir::quant::CreateUnfreezeConstantsPass()); - }, - ctx, module_op)); - - if (const tsl::Status create_dir_status = - Env::Default()->CreateDir(std::string(checkpoint_dir)); - !create_dir_status.ok()) { - LOG(ERROR) << "Failed to create checkpoint directory at: " - << checkpoint_dir; - return create_dir_status; - } - - TF_ASSIGN_OR_RETURN(const auto _, - SaveVariablesToCheckpoint(checkpoint_dir, module_op)); - - return RunPasses( - /*name=*/kTfQuantInsertRestoreOpStepName, - /*add_passes_func=*/ - [](mlir::PassManager &pm) { - pm.addPass(mlir::quant::CreateInsertRestoreOpPass()); - pm.addPass(mlir::quant::CreateInsertSaveOpPass()); - // Initialization by `tf.ConstOp` is no longer required as there is - // a `tf.RestoreV2Op` now. - pm.addPass( - mlir::quant::CreateRemoveVariableInitializationByConstPass()); - }, - ctx, module_op); -} - // Sets up and runs the passes for exporting `module_op`. The behavior of the // exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that // associate the input arguments of @main and the asset file names. 
Asset file @@ -385,17 +260,17 @@ absl::StatusOr> RunExportPasses( const ExportOptions &export_opts, mlir::MLIRContext &ctx, mlir::ModuleOp module_op) { if (export_opts.unfreeze_constants) { - TF_QUANT_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables( + TF_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables( export_opts.checkpoint_dir, ctx, module_op)); LOG(INFO) << "Unfrozen constants and saved variables to checkpoint file: " << export_opts.checkpoint_dir; } - if (const absl::Status pass_run_status = RunPasses( + if (absl::Status pass_run_status = RunPasses( /*name=*/export_opts.debug_name, /*add_passes_func=*/ [dup_constants = export_opts.duplicate_shape_determining_constants]( - mlir::PassManager &pm) { AddExportPasses(dup_constants, pm); }, + mlir::PassManager &pm) { AddExportPasses(pm, dup_constants); }, ctx, module_op); !pass_run_status.ok()) { return pass_run_status; @@ -462,15 +337,14 @@ absl::StatusOr QuantizeQatModel( return aliased_function_names.insert(aliases.first); }); - TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph( /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, /*is_inliner_run=*/true, /*noinline_functions=*/aliased_function_names, module_ref.get(), &context, bundle ? bundle->GetSession() : nullptr, /*run_tf_to_stablehlo=*/false)); - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantQatStepName, - /*add_passes_func=*/ + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/kTfQuantQatStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizeQatPasses(pm, quantization_options, kTfQuantQatStepName); }, @@ -478,7 +352,7 @@ absl::StatusOr QuantizeQatModel( const bool unfreeze_constants = !quantization_options.freeze_all_variables(); - TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTempFilename()); + TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); const auto export_opts = ExportOptions{ /*duplicate_shape_determining_constants=*/true, unfreeze_constants, @@ -533,25 +407,21 @@ absl::StatusOr QuantizePtqModelPreCalibration( const bool run_tf_to_stablehlo = (quantization_options.op_set() == tensorflow::quantization::OpSet::STABLEHLO); - TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph( /*mlir_dump_file_prefix=*/kTfQuantPtqPreCalibrationStepName, - /*is_inliner_run=*/true, - /*noinline_functions=*/aliased_function_names, module_ref.get(), &context, - bundle ? bundle->GetSession() : nullptr, run_tf_to_stablehlo)); + /*is_inliner_run=*/true, /*noinline_functions=*/aliased_function_names, + module_ref.get(), &context, bundle ? bundle->GetSession() : nullptr, + run_tf_to_stablehlo)); // Use StableHLO Quantizer option if opset is specified. 
if (run_tf_to_stablehlo) { - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqPreCalibrationStepStableHloName, - /*add_passes_func=*/ - [&quantization_options](mlir::PassManager &pm) { - AddQuantizePtqPreCalibrationStablehloPasses(pm, quantization_options); - }, - context, *module_ref)); + PreCalibrationComponent pre_calibration_component( + &context, quantization_options.calibration_options()); + TF_ASSIGN_OR_RETURN(*module_ref, pre_calibration_component.Run( + *module_ref, QuantizationConfig())); } else { - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqPreCalibrationStepName, - /*add_passes_func=*/ + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/kTfQuantPtqPreCalibrationStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizePtqPreCalibrationPasses(pm, quantization_options); }, @@ -559,7 +429,7 @@ absl::StatusOr QuantizePtqModelPreCalibration( } const bool unfreeze_constants = !quantization_options.freeze_all_variables(); - TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTempFilename()); + TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); // `duplicate_shape_determining_constants = false` because the // resulting graph of this step is not expected to be loaded on TPU. @@ -619,28 +489,26 @@ absl::StatusOr QuantizePtqModelPostCalibration( // Freezing is required again since variables might have been produced during // the pre-calibration step. `is_inliner_run = false` to prevent the functions // lifted for quantization from being inlined. - TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph( /*mlir_dump_file_prefix=*/kTfQuantPtqPostCalibrationStepName, - /*is_inliner_run=*/false, - /*noinline_functions=*/aliased_function_names, module_ref.get(), &context, - bundle ? bundle->GetSession() : nullptr, /*run_tf_to_stablehlo=*/false)); + /*is_inliner_run=*/false, /*noinline_functions=*/aliased_function_names, + module_ref.get(), &context, bundle ? bundle->GetSession() : nullptr, + /*run_tf_to_stablehlo=*/false)); // Use StableHLO Quantizer option if opset is specified. 
if (quantization_options.op_set() == tensorflow::quantization::OpSet::STABLEHLO) { - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqPostCalibrationStepStableHloName, - /*add_passes_func=*/ - [&quantization_options](mlir::PassManager &pm) { - AddQuantizePtqPostCalibrationStablehloPasses( - pm, quantization_options, - kTfQuantPtqPostCalibrationStepStableHloName); - }, - context, *module_ref)); + TF_RETURN_IF_ERROR( + RunPasses(/*name=*/kTfQuantPtqPostCalibrationStepStableHloName, + /*add_passes_func=*/ + [](mlir::PassManager &pm) { + AddQuantizePtqPostCalibrationStablehloPasses( + pm, kTfQuantPtqPostCalibrationStepStableHloName); + }, + context, *module_ref)); } else { - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqPostCalibrationStepName, - /*add_passes_func=*/ + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/kTfQuantPtqPostCalibrationStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizePtqPostCalibrationPasses( pm, quantization_options, kTfQuantPtqPostCalibrationStepName); @@ -649,7 +517,7 @@ absl::StatusOr QuantizePtqModelPostCalibration( } const bool unfreeze_constants = !quantization_options.freeze_all_variables(); - TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTempFilename()); + TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); const auto export_opts = ExportOptions{ /*duplicate_shape_determining_constants=*/true, unfreeze_constants, @@ -705,15 +573,14 @@ absl::StatusOr QuantizePtqDynamicRange( return aliased_function_names.insert(aliases.first); }); - TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph( /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, - /*is_inliner_run=*/true, - /*noinline_functions=*/aliased_function_names, module_ref.get(), &context, - bundle ? bundle->GetSession() : nullptr, /*run_tf_to_stablehlo=*/false)); + /*is_inliner_run=*/true, /*noinline_functions=*/aliased_function_names, + module_ref.get(), &context, bundle ? bundle->GetSession() : nullptr, + /*run_tf_to_stablehlo=*/false)); - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqDynamicRangeStepName, - /*add_passes_func=*/ + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/kTfQuantPtqDynamicRangeStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizePtqDynamicRangePasses(pm, quantization_options, kTfQuantPtqDynamicRangeStepName); @@ -721,7 +588,7 @@ absl::StatusOr QuantizePtqDynamicRange( context, *module_ref)); const bool unfreeze_constants = !quantization_options.freeze_all_variables(); - TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTempFilename()); + TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); const auto export_opts = ExportOptions{ /*duplicate_shape_determining_constants=*/true, unfreeze_constants, @@ -780,23 +647,22 @@ absl::StatusOr QuantizeWeightOnly( return aliased_function_names.insert(aliases.first); }); - TF_QUANT_RETURN_IF_ERROR(PreprocessAndFreezeGraph( + TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph( /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, /*is_inliner_run=*/true, /*noinline_functions=*/aliased_function_names, module_ref.get(), &context, bundle ? 
bundle->GetSession() : nullptr, /*run_tf_to_stablehlo=*/false)); - TF_QUANT_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantWeightOnlyStepName, - /*add_passes_func=*/ - [&quantization_options](mlir::PassManager &pm) { - AddQuantizeWeightOnlyPasses(pm, quantization_options, - kTfQuantWeightOnlyStepName); - }, - context, *module_ref)); + TF_RETURN_IF_ERROR( + RunPasses(/*name=*/kTfQuantWeightOnlyStepName, /*add_passes_func=*/ + [&quantization_options](mlir::PassManager &pm) { + AddQuantizeWeightOnlyPasses(pm, quantization_options, + kTfQuantWeightOnlyStepName); + }, + context, *module_ref)); const bool unfreeze_constants = !quantization_options.freeze_all_variables(); - TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTempFilename()); + TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); const auto export_opts = ExportOptions{ /*duplicate_shape_determining_constants=*/true, unfreeze_constants, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h index 4db0f667a619d5..81e5b6167fc0e3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h @@ -39,14 +39,8 @@ inline constexpr absl::string_view kTfQuantPtqDynamicRangeStepName = "tf_quant_ptq_dynamic_range"; inline constexpr absl::string_view kTfQuantWeightOnlyStepName = "tf_quant_weight_only"; -inline constexpr absl::string_view kTfQuantConstantUnfreezingStepName = - "tf_quant_constant_unfreezing"; -inline constexpr absl::string_view kTfQuantInsertRestoreOpStepName = - "tf_quant_insert_restore_op"; // StableHLO Quantization passes that are ran if StableHLO opset is selected. 
-inline constexpr absl::string_view kTfQuantPtqPreCalibrationStepStableHloName = - "tf_quant_ptq_pre_calibration_stablehlo"; inline constexpr absl::string_view kTfQuantPtqPostCalibrationStepStableHloName = "tf_quant_ptq_post_calibration_stablehlo"; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index 3746afa13b8dbe..affc19a9250890 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -14,36 +14,23 @@ # ============================================================================== """Defines TF Quantization API from SavedModel to SavedModel.""" -import collections.abc import tempfile -from typing import Callable, Collection, Dict, Mapping, Optional, Sequence +from typing import Mapping, Optional from absl import logging -import numpy as np -from tensorflow.compiler.mlir.quantization.tensorflow import exported_model_pb2 from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 as quant_opts_pb2 -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_algorithm -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 as calib_stats_pb2 from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib from tensorflow.compiler.mlir.quantization.tensorflow.python import pywrap_quantize_model from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as repr_dataset from tensorflow.compiler.mlir.quantization.tensorflow.python import save_model -from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import meta_graph_pb2 -from tensorflow.core.protobuf import saver_pb2 -from tensorflow.python.client import session -from tensorflow.python.eager import context -from tensorflow.python.eager import wrap_function -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_conversion from tensorflow.python.lib.io import file_io from tensorflow.python.saved_model import load as saved_model_load from tensorflow.python.saved_model import loader_impl as saved_model_loader from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants from tensorflow.python.trackable import autotrackable -from tensorflow.python.types import core from tensorflow.python.util import tf_export # Type aliases for quant_opts_pb2 messages. @@ -76,10 +63,6 @@ # during dynamic range quantization (DRQ) and weight-only quantization. _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS = 1024 -# Name of the saved model assets directory. -_ASSETS_DIR = 'assets' -_ASSETS_EXTRA_DIR = 'assets.extra' - def _is_qat_saved_model(saved_model_path: str): """Checks if the SavedModel is QAT-enabled by looking for 'FakeQuant' ops.""" @@ -95,485 +78,22 @@ def _is_qat_saved_model(saved_model_path: str): return False -def _create_sample_validator( - expected_input_keys: Collection[str], -) -> Callable[ - [repr_dataset.RepresentativeSample], repr_dataset.RepresentativeSample -]: - """Creates a validator function for a representative sample. - - Args: - expected_input_keys: Input keys (keyword argument names) that the function - the sample will be used for is expecting to receive. - - Returns: - A callable that validates a `RepresentativeSample`. 
- """ - - def validator( - sample: repr_dataset.RepresentativeSample, - ) -> repr_dataset.RepresentativeSample: - """Validates a single instance of representative sample. - - This provides a simple check for `sample` that this is a mapping of - {input_key: input_value}. - - Args: - sample: A `RepresentativeSample` to validate. - - Returns: - `sample` iff it is valid. - - Raises: - ValueError: iff the sample isn't an instance of `Mapping`. - KeyError: iff the sample does not have the set of input keys that match - the input keys of the function. - """ - if not isinstance(sample, collections.abc.Mapping): - raise ValueError( - 'Invalid representative sample type. Provide a mapping ' - '(usually a dict) of {input_key: input_value}. ' - f'Got type: {type(sample)} instead.' - ) - - if set(sample.keys()) != expected_input_keys: - raise KeyError( - 'Invalid input keys for representative sample. The function expects ' - f'input keys of: {set(expected_input_keys)}. ' - f'Got: {set(sample.keys())}. Please provide correct input keys for ' - 'representative samples.' - ) - - return sample - - return validator - - -def _validate_representative_dataset( - representative_dataset: repr_dataset.RepresentativeDatasetOrMapping, - signature_keys: Collection[str], -) -> None: - """Validates the representative dataset, based on the signature keys. - - Representative dataset can be provided in two different forms: a single - instance of `RepresentativeDataset` or a map of signature key to the - corresponding `RepresentativeDataset`. These have a relationship with - `signature_keys`. - - This function validates the following conditions: - * If `len(signature_keys) > 1`, then `representative_dataset` should be a - mapping where the keys exactly match the elements in `signature_keys`. - * If `len(signature_keys) == 1`, then both a mapping and a single instance of - `RepresentativeDataset` are allowed. - * This function also assumes `len(signature_keys) > 0`. - - Args: - representative_dataset: A `RepresentativeDataset` or a map of string to - `RepresentativeDataset` to be validated. - signature_keys: A collection of strings that contains the signature keys, - each identifying a `SignatureDef`. - - Raises: - ValueError: Iff `representative_dataset` does not satisfy the conditions - above. - """ - if isinstance(representative_dataset, collections.abc.Mapping): - if set(signature_keys) != set(representative_dataset.keys()): - raise ValueError( - 'The signature keys and the keys of representative dataset map ' - f'do not match. Signature keys: {set(signature_keys)}, ' - f'representative dataset map: {set(representative_dataset.keys())}.' - ) - else: - if len(signature_keys) > 1: - raise ValueError( - 'Representative dataset is not a mapping ' - f'(got: {type(representative_dataset)}), ' - 'but there is more than one signature key provided. ' - 'Please provide a map of {signature_key -> dataset} ' - 'with more than one signature key.' - ) - - -def _convert_values_to_tf_tensors( - sample: repr_dataset.RepresentativeSample, -) -> Mapping[str, core.Tensor]: - """Converts TensorLike values of `sample` to Tensors. - - Creates a copy of `sample`, where each value is converted to Tensors - unless it is already a Tensor. - The values are not converted in-place (i.e. `sample` is not mutated). - - Args: - sample: A representative sample, which is a map of {name -> tensorlike - value}. - - Returns: - Converted map of {name -> tensor}. 
- """ - tensor_mapping = {} - for name, tensorlike_value in sample.items(): - if isinstance(tensorlike_value, core.Tensor): - tensor_value = tensorlike_value - else: - tensor_value = tensor_conversion.convert_to_tensor_v2_with_dispatch( - tensorlike_value - ) - - tensor_mapping[name] = tensor_value - - return tensor_mapping - - -def _create_feed_dict_from_input_data( - input_data: repr_dataset.RepresentativeSample, - signature_def: meta_graph_pb2.SignatureDef, -) -> Dict[str, np.ndarray]: - """Constructs a feed_dict from input data. - - Note: This function should only be used in graph mode. - - This is a helper function that converts an 'input key -> input value' mapping - to a feed dict. A feed dict is an 'input tensor name -> input value' mapping - and can be directly passed to the `feed_dict` argument of `sess.run()`. +def _serialize_signature_def_map( + signature_def_map: _SignatureDefMap, +) -> dict[str, bytes]: + """Serializes SignatureDef values in `signature_def_map`. Args: - input_data: Input key -> input value mapping. The input keys should match - the input keys of `signature_def`. - signature_def: A SignatureDef representing the function that `input_data` is - an input to. + signature_def_map: Signature key -> SignatureDef mapping. Returns: - Feed dict, which is intended to be used as input for `sess.run`. It is - essentially a mapping: input tensor name -> input value. Note that the input - value in the feed dict is not a `Tensor`. + Signature def map where the values (`SignatureDef`) are serialized. """ - feed_dict = {} - for input_key, input_value in input_data.items(): - input_tensor_name = signature_def.inputs[input_key].name - - value = input_value - if isinstance(input_value, core.Tensor): - # Take the data out of the tensor. - value = input_value.eval() - - feed_dict[input_tensor_name] = value - - return feed_dict - - -# TODO(b/249918070): Implement a progress bar. -def _log_sample_num_for_calibration( - representative_dataset: repr_dataset.RepresentativeDataset, -) -> repr_dataset.RepresentativeDataset: - """Logs the sample number for calibration. + signature_def_map_serialized = {} + for key, signature_def in signature_def_map.items(): + signature_def_map_serialized[key] = signature_def.SerializeToString() - If in debug logging level, the "sample number / total num samples" is logged - for every 5 iterations. - - This is often useful when tracking the progress of the calibration step which - is often slow and may look stale if there's no logs being printed. - - Args: - representative_dataset: The representative dataset. - - Yields: - The representative samples from `representative_dataset` without any - modification. - """ - num_samples: Optional[int] = repr_dataset.get_num_samples( - representative_dataset - ) - if num_samples is None: - total_num_samples = '?' - logging.info('Representative dataset size unknown.') - else: - total_num_samples = str(num_samples) - logging.info('Using representative dataset of size: %s', total_num_samples) - - sample_num = 0 - for sample in representative_dataset: - sample_num += 1 - - # Log the sample number for every 5 iterations. 
- logging.log_every_n( - logging.DEBUG, - 'Running representative sample for calibration: %d / %s', - 5, - sample_num, - total_num_samples, - ) - yield sample - - logging.info( - 'Running representative samples complete: %d / %s', - sample_num, - total_num_samples, - ) - - -def _run_function_for_calibration_graph_mode( - sess: session.Session, - signature_def: meta_graph_pb2.SignatureDef, - representative_dataset: repr_dataset.RepresentativeDataset, -) -> None: - """Runs the representative dataset through a function for calibration. - - NOTE: This is intended to be run in graph mode (TF1). - - The function is identified by the SignatureDef. - - Args: - sess: The Session object to run the function in. - signature_def: A SignatureDef that identifies a function by specifying the - inputs and outputs. - representative_dataset: The representative dataset to run through the - function. - """ - output_tensor_names = [ - output_tensor_info.name - for output_tensor_info in signature_def.outputs.values() - ] - - sample_validator = _create_sample_validator( - expected_input_keys=signature_def.inputs.keys() - ) - - for sample in map( - sample_validator, _log_sample_num_for_calibration(representative_dataset) - ): - # Create a mapping from input tensor name to the input tensor value. - # ex) "Placeholder:0" -> [0, 1, 2] - feed_dict = _create_feed_dict_from_input_data(sample, signature_def) - sess.run(output_tensor_names, feed_dict=feed_dict) - - -def _replace_tensors_by_numpy_ndarrays( - repr_ds_map: repr_dataset.RepresentativeDatasetMapping, -) -> None: - """Replaces tf.Tensors by their evaluated numpy arrays. - - This assumes that tf.Tensors in representative samples are created in the - default Graph. It will raise an error if tensors are created in a different - graph. - - Args: - repr_ds_map: SignatureDef key -> RepresentativeDataset mapping. - """ - with session.Session() as sess: - for signature_def_key in repr_ds_map: - # Replaces the dataset with a new dataset where tf.Tensors are replaced - # by their evaluated values. - ds = repr_ds_map[signature_def_key] - repr_ds_map[signature_def_key] = ( - repr_dataset.replace_tensors_by_numpy_ndarrays(ds, sess) - ) - - -def _run_graph_for_calibration_graph_mode( - model_dir: str, - tags: Collection[str], - representative_dataset_map: repr_dataset.RepresentativeDatasetMapping, -) -> None: - """Runs the graph for calibration in graph mode. - - This function assumes _graph mode_ (used when legacy TF1 is used or when eager - mode is explicitly disabled) when running the graph. This step is used in - order to collect the statistics in CustomAggregatorOp for quantization using - the representative dataset for the actual data provided for inference. - - Args: - model_dir: Path to SavedModel directory. - tags: Collection of tags identifying the MetaGraphDef within the SavedModel. - representative_dataset_map: A map where signature keys are mapped to - corresponding representative datasets. - - Raises: - ValueError: When running the function with the representative dataset fails. - """ - # Replace tf.Tensors by numpy ndarrays in order to reuse the samples in a - # different graph when running the calibration. - _replace_tensors_by_numpy_ndarrays(representative_dataset_map) - - # Run the calibration in a new graph to avoid name collision, which could - # happen when the same model is loaded multiple times in the default graph. 
- with ops.Graph().as_default(), session.Session() as sess: - meta_graph: meta_graph_pb2.MetaGraphDef = saved_model_loader.load( - sess, tags, export_dir=model_dir - ) - - for signature_key, repr_ds in representative_dataset_map.items(): - sig_def = meta_graph.signature_def[signature_key] - - try: - _run_function_for_calibration_graph_mode( - sess, signature_def=sig_def, representative_dataset=repr_ds - ) - except Exception as ex: - raise ValueError( - 'Failed to run representative dataset through the ' - f'function with the signature key: {signature_key}.' - ) from ex - - -def _run_function_for_calibration_eager_mode( - func: wrap_function.WrappedFunction, - representative_dataset: repr_dataset.RepresentativeDataset, -) -> None: - """Runs the representative dataset through a function for calibration. - - NOTE: This is intended to be run in eager mode (TF2). - - Args: - func: The function to run the representative samples through. - representative_dataset: Representative dataset used for calibration. The - input keys and input values of the representative samples should match the - keyword arguments of `func`. - """ - _, keyword_args = func.structured_input_signature - sample_validator = _create_sample_validator( - expected_input_keys=keyword_args.keys() - ) - - for sample in map( - sample_validator, _log_sample_num_for_calibration(representative_dataset) - ): - # Convert any non-Tensor values from the sample to Tensors. - # This conversion is required because the model saved in `model_dir` is - # saved using TF1 SavedModelBuilder, which doesn't save the - # SavedObjectGraph. - # TODO(b/236795224): Remove the need for this conversion by keeping the - # FunctionSpec (object graph) in the SavedModel. Related: b/213406917. - func_kwargs = _convert_values_to_tf_tensors(sample) - func(**func_kwargs) - - -def _run_graph_for_calibration_eager_mode( - model_dir: str, - tags: Collection[str], - representative_dataset_map: repr_dataset.RepresentativeDatasetMapping, -) -> None: - """Runs the graph for calibration in eager mode. - - This function assumes _eager mode_ (enabled in TF2 by default) when running - the graph. This step is used in order to collect the statistics in - CustomAggregatorOp for quantization using the representative dataset for the - actual data provided for inference. - - Args: - model_dir: Path to SavedModel directory. - tags: Collection of tags identifying the MetaGraphDef within the SavedModel. - representative_dataset_map: A map where signature keys are mapped to - corresponding representative datasets. - - Raises: - ValueError: When running the function with the representative dataset fails. - """ - root: autotrackable.AutoTrackable = saved_model_load.load(model_dir, tags) - for signature_key, repr_ds in representative_dataset_map.items(): - try: - _run_function_for_calibration_eager_mode( - func=root.signatures[signature_key], representative_dataset=repr_ds - ) - except Exception as ex: - raise ValueError( - 'Failed to run representative dataset through the ' - f'function with the signature key: {signature_key}.' - ) from ex - - -def _run_graph_for_calibration( - float_model_dir: str, - signature_keys: Sequence[str], - tags: Collection[str], - representative_dataset: repr_dataset.RepresentativeDatasetOrMapping, - force_graph_mode_calibration: bool, -) -> None: - """Runs the graph for calibration using representative datasets. - - Args: - float_model_dir: Path to the model to calibrate. 
- signature_keys: Sequence of keys identifying SignatureDef containing inputs - and outputs. - tags: Collection of tags identifying the MetaGraphDef within the SavedModel - to analyze. - representative_dataset: An iterator that returns a dictionary of {input_key: - input_value} or a mapping from signature keys to such iterators. When - `signature_keys` contains more than one signature key, - `representative_datsaet` should be a mapping that maps each signature keys - to the corresponding representative dataset. - force_graph_mode_calibration: If set to true, it forces calibration in graph - model instead of eager mode when the context is in eager mode. - - Raises: - ValueError iff: - * The representative dataset format is invalid. - * It fails to run the functions using the representative datasets. - """ - try: - _validate_representative_dataset(representative_dataset, signature_keys) - except Exception as ex: - raise ValueError('Invalid representative dataset.') from ex - - # If `representative_dataset` is not a mapping, convert to a mapping for the - # following functions to handle representative datasets more conveniently. - representative_dataset_map = representative_dataset - if not isinstance(representative_dataset, collections.abc.Mapping): - # `signature_keys` is guaranteed to have only one element after the - # validation. - representative_dataset_map = {signature_keys[0]: representative_dataset} - - try: - if context.executing_eagerly() and not force_graph_mode_calibration: - logging.info('Calibration step is executed in eager mode.') - _run_graph_for_calibration_eager_mode( - float_model_dir, tags, representative_dataset_map - ) - else: - logging.info('Calibration step is executed in graph mode.') - _run_graph_for_calibration_graph_mode( - float_model_dir, tags, representative_dataset_map - ) - except Exception as ex: - raise ValueError( - 'Failed to run graph for post-training quantization calibration.' - ) from ex - - logging.info('Calibration step complete.') - - -def _copy_assets(src_path: str, dst_path: str) -> None: - """Copies the assets directory of the saved model. - - Clones the contents of the assets/ directory from the source saved model - directory to the destination saved model directory. Nothing will be copied if - there are no assets directory in the source directory. - - Args: - src_path: Source saved model directory. - dst_path: Destination saved model directory. This directory must exist. - """ - for assets_dir_name in [_ASSETS_DIR, _ASSETS_EXTRA_DIR]: - src_assets_path = file_io.join(src_path, assets_dir_name) - if not file_io.file_exists_v2(src_assets_path): - # Do nothing if the source assets path does not exist. - continue - - dst_assets_path = file_io.join(dst_path, assets_dir_name) - file_io.create_dir_v2(dst_assets_path) - - for curr_dir, _, files in file_io.walk_v2(src_assets_path): - for asset_file_name in files: - src_asset_file = file_io.join(curr_dir, asset_file_name) - - # Construct the destination assets file path. 
- curr_dst_dir = curr_dir.replace(src_assets_path, dst_assets_path) - dst_asset_file = file_io.join(curr_dst_dir, asset_file_name) - - file_io.copy_v2(src_asset_file, dst_asset_file) - logging.info( - 'Copied asset file: %s -> %s', src_asset_file, dst_asset_file - ) + return signature_def_map_serialized def _run_static_range_qat( @@ -599,145 +119,17 @@ def _run_static_range_qat( quant_opts.tags ).meta_info_def.function_aliases - exported_model_serialized = pywrap_quantize_model.quantize_qat_model( + pywrap_quantize_model.quantize_qat_model( src_saved_model_path, - list(quant_opts.signature_keys), - set(quant_opts.tags), - quant_opts.SerializeToString(), - dict(function_aliases), - ) - - exported_model = exported_model_pb2.ExportedModel.FromString( - exported_model_serialized - ) - - save_model.save_model_v1( - exported_model.graph_def, dst_saved_model_path, - signature_def_map, - quant_opts.tags, - init_op_name=exported_model.init_node_name, - saver_def=_get_saver_def_or_none(exported_model), - checkpoint_dir=exported_model.checkpoint_dir, - function_aliases=exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, - ) - - _copy_assets(src_saved_model_path, dst_saved_model_path) - - -def _get_min_max_from_calibrator( - node_id: bytes, - calib_opts: quant_opts_pb2.CalibrationOptions, -) -> tuple[float, float]: - """Calculate min and max from statistics using calibration options. - - Args: - node_id: bytes of node id. - calib_opts: Calibration options used for calculating min and max. - - Returns: - (min_value, max_value): Min and max calculated using calib_opts. - - Raises: - ValueError: Unsupported calibration method is given. - """ - statistics: calib_stats_pb2.CalibrationStatistics = ( - pywrap_quantize_model.get_statistics_from_calibrator(node_id) + quantization_options_serialized=quant_opts.SerializeToString(), + signature_keys=list(quant_opts.signature_keys), + signature_def_map_serialized=_serialize_signature_def_map( + signature_def_map + ), + function_aliases=dict(function_aliases), + py_function_library=py_function_lib.PyFunctionLibrary(), ) - min_value, max_value = calibration_algorithm.get_min_max_value( - statistics, calib_opts - ) - return min_value, max_value - - -def _add_calibration_statistics( - graph_def: graph_pb2.GraphDef, - calib_opts: quant_opts_pb2.CalibrationOptions, -) -> None: - """Adds calibration statistics to the graph def. - - This function must be run after running the graph with a representative - dataset. Retrieves calibration statistics from the global calibrator and adds - them to the corresponding nodes as attributes. - - Args: - graph_def: GraphDef to add calibration statistics to. - calib_opts: Calibration options to calculate min and max. - """ - for function_def in graph_def.library.function: - for node_def in function_def.node_def: - if node_def.op != 'CustomAggregator': - continue - - node_id = node_def.attr['id'].s - try: - min_value, max_value = _get_min_max_from_calibrator(node_id, calib_opts) - pywrap_quantize_model.clear_data_from_calibrator(node_id) - - node_def.attr['min'].f = min_value - node_def.attr['max'].f = max_value - except ValueError: - logging.warning( - ( - 'CustomAggregator id "%s" from FunctionDef "%s" does not have ' - 'min or max values. Parts of this function are not quantized.' - ), - node_id.decode('utf-8'), - function_def.signature.name, - ) - - -def _enable_dump_tensor(graph_def: graph_pb2.GraphDef) -> None: - """Enable DumpTensor in the graph def. 
- - DumpTensor is disabled by default to avoid logging data during calibration. - This function is called after calibration to enable DumpTensor. - - Args: - graph_def: GraphDef to enable DumpTensor - """ - for function_def in graph_def.library.function: - for node_def in function_def.node_def: - if node_def.op != 'DumpTensor': - continue - - node_def.attr['enabled'].b = True - - -def _change_dump_tensor_file_name(graph_def: graph_pb2.GraphDef) -> None: - """Change file_name used by DumpTensor to quantized_tensor_data.pb. - - In whole model verify, DumpTensor in unquantized model uses file_name - unquantized_tensor_data.pb. - After unquantized dump model is created, this function allows quantized dump - model to use quantized_tensor_data.pb as file_name. - - Args: - graph_def: GraphDef to change file_name of DumpTensor - """ - for function_def in graph_def.library.function: - for node_def in function_def.node_def: - if node_def.op != 'DumpTensor': - continue - - node_def.attr['file_name'].s = 'quantized_tensor_data.pb'.encode('utf-8') - - -def _get_saver_def_or_none( - exported_model: exported_model_pb2.ExportedModel, -) -> Optional[saver_pb2.SaverDef]: - """Returns the SaverDef from ExportedModel, None otherwise. - - Args: - exported_model: ExportedModel to take the SaverDef from. - - Returns: - SaverDef instance if the field `saver_def` is set. None otherwise. - """ - if exported_model.HasField('saver_def'): - return exported_model.saver_def - return None def _run_static_range_ptq( @@ -766,134 +158,29 @@ def _run_static_range_ptq( Raises: ValueError if the graph doesn't contain a valid signature. """ - logging.info('Running post-training quantization pre-calibration step.') + logging.info('Running static-range post-training quantization.') loader = saved_model_loader.SavedModelLoader(src_saved_model_path) function_aliases = loader.get_meta_graph_def_from_tags( quant_opts.tags ).meta_info_def.function_aliases - exported_model_serialized = ( - pywrap_quantize_model.quantize_ptq_model_pre_calibration( - src_saved_model_path, - list(quant_opts.signature_keys), - set(quant_opts.tags), - quant_opts.SerializeToString(), - dict(function_aliases), - py_function_lib.PyFunctionLibrary(), - ) - ) - exported_model = exported_model_pb2.ExportedModel.FromString( - exported_model_serialized - ) - - graph_def = exported_model.graph_def - pre_calib_output_model_path = tempfile.mkdtemp() - save_model.save_model_v1( - graph_def, - pre_calib_output_model_path, - signature_def_map, - quant_opts.tags, - exported_model.init_node_name, - _get_saver_def_or_none(exported_model), - exported_model.checkpoint_dir, - exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, - ) - - _copy_assets(src_saved_model_path, pre_calib_output_model_path) - - # Uses the representative dataset to collect statistics for calibration. - # Handles the graph mode execution separately in case TF2 is disabled or - # eager execution is disabled. The min & max values are stored separately - # in a global CalibratorSingleton instance. - _run_graph_for_calibration( - pre_calib_output_model_path, - quant_opts.signature_keys, - quant_opts.tags, - representative_dataset, - quant_opts.force_graph_mode_calibration, - ) - - _add_calibration_statistics(graph_def, quant_opts.calibration_options) - - if quant_opts.HasField('debugger_options'): - # Since DumpTensor was disabled by default, we need to enable them. 
- _enable_dump_tensor(graph_def) - - if ( - quant_opts.debugger_options.debugger_type - == quant_opts_pb2.DebuggerOptions.DebuggerType.DEBUGGER_TYPE_WHOLE_MODEL - ): - # TODO: b/295139417 - Remove CustomAggregator op in unquantized dump model - # TODO: b/296916287 - Create a separate function for saving unquantized - # dump model - save_model.save_model_v1( - graph_def, - quant_opts.debugger_options.unquantized_dump_model_path, - signature_def_map, - quant_opts.tags, - exported_model.init_node_name, - _get_saver_def_or_none(exported_model), - exported_model.checkpoint_dir, - exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, - ) - - _copy_assets( - src_saved_model_path, - quant_opts.debugger_options.unquantized_dump_model_path, - ) - - _change_dump_tensor_file_name(graph_def) - - calibrated_model_path = tempfile.mkdtemp() - save_model.save_model_v1( - graph_def, - calibrated_model_path, - signature_def_map, - quant_opts.tags, - exported_model.init_node_name, - _get_saver_def_or_none(exported_model), - exported_model.checkpoint_dir, - asset_file_defs=exported_model.asset_file_defs, - ) - - _copy_assets(pre_calib_output_model_path, calibrated_model_path) - - logging.info('Running post-training quantization post-calibration step.') - exported_model_serialized = ( - pywrap_quantize_model.quantize_ptq_model_post_calibration( - calibrated_model_path, - list(quant_opts.signature_keys), - set(quant_opts.tags), - quant_opts.SerializeToString(), - dict(exported_model.function_aliases), - ) - ) - - exported_model = exported_model_pb2.ExportedModel.FromString( - exported_model_serialized - ) - - save_model.save_model_v1( - exported_model.graph_def, + signature_def_map_serialized = _serialize_signature_def_map(signature_def_map) + pywrap_quantize_model.quantize_ptq_static_range( + src_saved_model_path, dst_saved_model_path, - signature_def_map, - quant_opts.tags, - init_op_name=exported_model.init_node_name, - saver_def=_get_saver_def_or_none(exported_model), - checkpoint_dir=exported_model.checkpoint_dir, - function_aliases=exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, + quantization_options_serialized=quant_opts.SerializeToString(), + signature_keys=list(quant_opts.signature_keys), + signature_def_map_serialized=signature_def_map_serialized, + function_aliases=dict(function_aliases), + py_function_library=py_function_lib.PyFunctionLibrary(), + representative_dataset=representative_dataset, ) - _copy_assets(calibrated_model_path, dst_saved_model_path) - def _static_range_quantize( - saved_model_path: str, - output_directory: str, + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options: _QuantizationOptions, representative_dataset: Optional[ repr_dataset.RepresentativeDatasetOrMapping @@ -907,10 +194,10 @@ def _static_range_quantize( model input, `representative_dataset` will be ignored. Args: - saved_model_path: Path to the saved model. When representative_dataset is - not provided, this should be a model trained with QAT. - output_directory: The path to save the output SavedModel. The directory will - be overwritten if not empty. + src_saved_model_path: Path to the saved model. When representative_dataset + is not provided, this should be a model trained with QAT. + dst_saved_model_path: The path to save the output SavedModel. The directory + will be overwritten if not empty. quantization_options: QuantizationOptions proto describing quantization related config. 
representative_dataset: a generator that returns a dictionary in {input_key: @@ -927,18 +214,18 @@ def _static_range_quantize( in the SavedModel. """ logging.info( - 'Running static range quantization on model: %s', saved_model_path + 'Running static range quantization on model: %s', src_saved_model_path ) logging.info('QuantizationOptions: \n%s', quantization_options) is_qat_saved_model_or_method_no_quantize = _is_qat_saved_model( - saved_model_path + src_saved_model_path ) or ( quantization_options.quantization_method.preset_method == _QuantizationMethod.METHOD_NO_QUANTIZE ) signature_def_map = save_model.get_signatures_from_saved_model( - saved_model_path, + src_saved_model_path, quantization_options.signature_keys, set(quantization_options.tags), ) @@ -961,34 +248,34 @@ def _static_range_quantize( if is_qat_saved_model_or_method_no_quantize: _run_static_range_qat( - saved_model_path, - output_directory, + src_saved_model_path, + dst_saved_model_path, quantization_options, signature_def_map, ) else: _run_static_range_ptq( - saved_model_path, - output_directory, + src_saved_model_path, + dst_saved_model_path, quantization_options, representative_dataset, signature_def_map, ) - return saved_model_load.load(output_directory) + return saved_model_load.load(dst_saved_model_path) def _dynamic_range_quantize( - saved_model_path: str, - output_directory: str, + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options: _QuantizationOptions, ) -> autotrackable.AutoTrackable: """Quantizes the given SavedModel via post-training dynamic range quantization. Args: - saved_model_path: Path to the saved model. - output_directory: The path to save the output SavedModel. The directory will - be overwritten if not empty. + src_saved_model_path: Path to the saved model. + dst_saved_model_path: The path to save the output SavedModel. The directory + will be overwritten if not empty. quantization_options: QuantizationOptions proto describing quantization related config. @@ -999,68 +286,56 @@ def _dynamic_range_quantize( ValueError: when the model is QAT model. """ mode_str = 'dynamic-range quantization' - if _is_qat_saved_model(saved_model_path): + if _is_qat_saved_model(src_saved_model_path): raise ValueError( 'The models trained with quantization-aware training (QAT) is not ' 'supported for %s.' % mode_str ) logging.info( - 'Running post-training %s on model: %s', mode_str, saved_model_path + 'Running post-training %s on model: %s', mode_str, src_saved_model_path ) logging.info('QuantizationOptions: \n%s', quantization_options) - loader = saved_model_loader.SavedModelLoader(saved_model_path) + loader = saved_model_loader.SavedModelLoader(src_saved_model_path) function_aliases = loader.get_meta_graph_def_from_tags( quantization_options.tags ).meta_info_def.function_aliases - # Apply post-training dynamic range quantization to the model. 
- exported_model_serialized = pywrap_quantize_model.quantize_ptq_dynamic_range( - saved_model_path, - list(quantization_options.signature_keys), - set(quantization_options.tags), - quantization_options.SerializeToString(), - dict(function_aliases), - ) - - exported_model = exported_model_pb2.ExportedModel.FromString( - exported_model_serialized - ) signature_def_map = save_model.get_signatures_from_saved_model( - saved_model_path, + src_saved_model_path, quantization_options.signature_keys, quantization_options.tags, ) - save_model.save_model_v1( - exported_model.graph_def, - output_directory, - signature_def_map, - quantization_options.tags, - init_op_name=exported_model.init_node_name, - saver_def=_get_saver_def_or_none(exported_model), - checkpoint_dir=exported_model.checkpoint_dir, - function_aliases=exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, + # Apply post-training dynamic range quantization to the model. + pywrap_quantize_model.quantize_ptq_dynamic_range( + src_saved_model_path, + dst_saved_model_path, + quantization_options_serialized=quantization_options.SerializeToString(), + signature_keys=list(quantization_options.signature_keys), + signature_def_map_serialized=_serialize_signature_def_map( + signature_def_map + ), + function_aliases=dict(function_aliases), + py_function_library=py_function_lib.PyFunctionLibrary(), ) - _copy_assets(saved_model_path, output_directory) - return saved_model_load.load(output_directory) + return saved_model_load.load(dst_saved_model_path) def _weight_only_quantize( - saved_model_path: str, - output_directory: str, + src_saved_model_path: str, + dst_saved_model_path: str, quantization_options: quant_opts_pb2.QuantizationOptions, ) -> autotrackable.AutoTrackable: """Quantizes the given SavedModel via weight-only quantization. Args: - saved_model_path: Path to the saved model. - output_directory: The path to save the output SavedModel. The directory will - be overwritten if not empty. + src_saved_model_path: Path to the saved model. + dst_saved_model_path: The path to save the output SavedModel. The directory + will be overwritten if not empty. quantization_options: QuantizationOptions proto describing quantization related config. @@ -1073,52 +348,41 @@ def _weight_only_quantize( mode_str = 'weight-only quantization' # QAT weight-only is not supported yet. - if _is_qat_saved_model(saved_model_path): + if _is_qat_saved_model(src_saved_model_path): raise ValueError( 'The models trained with quantization-aware training (QAT) is not ' 'supported for %s.' 
% mode_str ) logging.info( - 'Running post-training %s on model: %s', mode_str, saved_model_path + 'Running post-training %s on model: %s', mode_str, src_saved_model_path ) logging.info('QuantizationOptions: \n%s', quantization_options) - loader = saved_model_loader.SavedModelLoader(saved_model_path) + loader = saved_model_loader.SavedModelLoader(src_saved_model_path) function_aliases = loader.get_meta_graph_def_from_tags( quantization_options.tags ).meta_info_def.function_aliases - exported_model_serialized = pywrap_quantize_model.quantize_weight_only( - saved_model_path, - quantization_options.SerializeToString(), - dict(function_aliases), - ) - - exported_model = exported_model_pb2.ExportedModel.FromString( - exported_model_serialized - ) signature_def_map = save_model.get_signatures_from_saved_model( - saved_model_path, + src_saved_model_path, list(quantization_options.signature_keys), set(quantization_options.tags), ) - save_model.save_model_v1( - exported_model.graph_def, - output_directory, - signature_def_map, - quantization_options.tags, - init_op_name=exported_model.init_node_name, - saver_def=_get_saver_def_or_none(exported_model), - checkpoint_dir=exported_model.checkpoint_dir, - function_aliases=exported_model.function_aliases, - asset_file_defs=exported_model.asset_file_defs, + pywrap_quantize_model.quantize_weight_only( + src_saved_model_path, + dst_saved_model_path, + quantization_options_serialized=quantization_options.SerializeToString(), + signature_def_map_serialized=_serialize_signature_def_map( + signature_def_map + ), + function_aliases=dict(function_aliases), + py_function_library=py_function_lib.PyFunctionLibrary(), ) - _copy_assets(saved_model_path, output_directory) - return saved_model_load.load(output_directory) + return saved_model_load.load(dst_saved_model_path) def _verify_output_dir(output_dir: Optional[str], overwrite: bool) -> None: @@ -1356,28 +620,34 @@ def _populate_quantization_options_default_values( 'Legacy weight-only is deprecated. Use weight-only quantization method.' ) + # Converter assumes options are specified. So set SRQ explicitly. + if ( + quantization_options.quantization_method.preset_method + == _PresetMethod.METHOD_UNSPECIFIED + ): + logging.debug( + '"preset_method" for QuantizationMethod is not specified.' + 'Static range quantization is used by default.' + ) + quantization_options.quantization_method.preset_method = ( + _PresetMethod.METHOD_STATIC_RANGE_INT8 + ) + # Check default quantization option values for weight-only quantization. # TODO(b/242805842): Find good minimum_elements_for_weights number for server. # please also update default value in tflite converter: # tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc;l=201 - if ( - quantization_options.quantization_method.preset_method - == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8 - ) or ( - quantization_options.quantization_method.preset_method - == _PresetMethod.METHOD_DYNAMIC_RANGE_INT8 - ): - if quantization_options.min_num_elements_for_weights == 0: - quantization_options.min_num_elements_for_weights = ( - _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS - ) - logging.warning( - ( - 'QuantizationOptions.min_num_elements_for_weights is not set (0).' - ' Setting to the default value: %d.' 
- ), - _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS, - ) + if quantization_options.min_num_elements_for_weights == 0: + quantization_options.min_num_elements_for_weights = ( + _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS + ) + logging.warning( + ( + 'QuantizationOptions.min_num_elements_for_weights is not set (0).' + ' Setting to the default value: %d.' + ), + _DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS, + ) # TODO: b/307900054 - Set the per-channel quantization by default. if quantization_options.enable_per_channel_quantization and not ( @@ -1417,19 +687,6 @@ def _populate_quantization_options_default_values( ' quantization via TF Quantizer.' ) - # Converter assumes options are specified. So set SRQ explicitly. - if ( - quantization_options.quantization_method.preset_method - == _PresetMethod.METHOD_UNSPECIFIED - ): - logging.debug( - '"preset_method" for QuantizationMethod is not specified.' - 'Static range quantization is used by default.' - ) - quantization_options.quantization_method.preset_method = ( - _PresetMethod.METHOD_STATIC_RANGE_INT8 - ) - if quantization_options.HasField('debugger_options'): # Set `force_graph_mode_calibration` to True to avoid skipping op execution, # which are not connected to return ops, during calibration execution. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py index f3e8cc9d6bcb50..6fc618b5f92646 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py @@ -18,7 +18,10 @@ import os from typing import Iterable, Mapping, Optional, Union +import numpy as np + from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 +from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import readers from tensorflow.python.eager import context @@ -302,3 +305,40 @@ def get_num_samples(repr_ds: RepresentativeDataset) -> Optional[int]: return None else: return None + + +def create_feed_dict_from_input_data( + input_data: RepresentativeSample, + signature_def: meta_graph_pb2.SignatureDef, +) -> Mapping[str, np.ndarray]: + """Constructs a feed_dict from input data. + + Note: This function should only be used in graph mode. + + This is a helper function that converts an 'input key -> input value' mapping + to a feed dict. A feed dict is an 'input tensor name -> input value' mapping + and can be directly passed to the `feed_dict` argument of `sess.run()`. + + Args: + input_data: Input key -> input value mapping. The input keys should match + the input keys of `signature_def`. + signature_def: A SignatureDef representing the function that `input_data` is + an input to. + + Returns: + Feed dict, which is intended to be used as input for `sess.run`. It is + essentially a mapping: input tensor name -> input value. Note that the input + value in the feed dict is not a `Tensor`. + """ + feed_dict = {} + for input_key, input_value in input_data.items(): + input_tensor_name = signature_def.inputs[input_key].name + + value = input_value + if isinstance(input_value, core.Tensor): + # Take the data out of the tensor. 
+ value = input_value.eval() + + feed_dict[input_tensor_name] = value + + return feed_dict diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset_test.py index b5fa11c43bbc5b..f9e05be36eb5af 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset_test.py @@ -18,7 +18,9 @@ import numpy as np from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as repr_dataset +from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.client import session +from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.platform import test @@ -224,6 +226,57 @@ def __len__(self): self.assertIsNone(repr_dataset.get_num_samples(LenRaisingError())) + @test_util.deprecated_graph_mode_only + def test_create_feed_dict_from_input_data(self): + signature_def = meta_graph_pb2.SignatureDef( + inputs={'input_tensor': meta_graph_pb2.TensorInfo(name='input:0')} + ) + rng = np.random.default_rng(seed=14) + + input_tensor_value = rng.random(size=(2, 2)) + sample = {'input_tensor': input_tensor_value} + + feed_dict = repr_dataset.create_feed_dict_from_input_data( + sample, signature_def + ) + + self.assertLen(feed_dict, 1) + self.assertIn('input:0', feed_dict) + self.assertAllEqual(feed_dict['input:0'], input_tensor_value) + + @test_util.deprecated_graph_mode_only + def test_create_feed_dict_from_input_data_core_tensors(self): + signature_def = meta_graph_pb2.SignatureDef( + inputs={'input_tensor': meta_graph_pb2.TensorInfo(name='input:0')} + ) + + with self.session(): + input_tensor = constant_op.constant([1, 2, 3, 4, 5, 6]) + sample = {'input_tensor': input_tensor} + + feed_dict = repr_dataset.create_feed_dict_from_input_data( + sample, signature_def + ) + input_tensor_data = input_tensor.eval() + + self.assertLen(feed_dict, 1) + self.assertIn('input:0', feed_dict) + self.assertIsInstance(feed_dict['input:0'], np.ndarray) + self.assertAllEqual(feed_dict['input:0'], input_tensor_data) + + @test_util.deprecated_graph_mode_only + def test_create_feed_dict_from_input_data_empty(self): + signature_def = meta_graph_pb2.SignatureDef( + inputs={'input_tensor': meta_graph_pb2.TensorInfo(name='input:0')} + ) + + sample = {} + feed_dict = repr_dataset.create_feed_dict_from_input_data( + sample, signature_def + ) + + self.assertEmpty(feed_dict) + class RepresentativeDatasetSaverTest(test.TestCase): """Test cases for RepresentativeDatasetSaver.""" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h index 669415a1aac078..a7beffd826a083 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h @@ -16,102 +16,131 @@ limitations under the License. 
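
(Note on the representative-dataset helper added above.) create_feed_dict_from_input_data resolves each input key through the SignatureDef to the corresponding tensor name and unwraps core.Tensor values with eval(), so it is graph-mode only. A minimal usage sketch in Python, mirroring the unit tests above; the signature key 'input_tensor' and tensor name 'input:0' are placeholders:

import numpy as np

from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as repr_dataset
from tensorflow.core.protobuf import meta_graph_pb2

# Hypothetical single-input signature whose input maps to the tensor 'input:0'.
signature_def = meta_graph_pb2.SignatureDef(
    inputs={'input_tensor': meta_graph_pb2.TensorInfo(name='input:0')}
)
sample = {'input_tensor': np.random.default_rng(0).random(size=(2, 2))}

# Yields {'input:0': <np.ndarray>}, ready to be passed as sess.run(..., feed_dict=...).
feed_dict = repr_dataset.create_feed_dict_from_input_data(sample, signature_def)
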
#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_TYPE_CASTERS_H_ #include +#include #include +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "pybind11/cast.h" // from @pybind11 #include "pybind11/detail/common.h" // from @pybind11 #include "pybind11/pytypes.h" // from @pybind11 #include "pybind11_abseil/absl_casters.h" // from @pybind11_abseil // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep namespace pybind11::detail { namespace internal { -// Serializes an ExportedModel. Raises python ValueError if serialization fails. -std::string Serialize( - const tensorflow::quantization::ExportedModel& exported_model) { - const std::string exported_model_serialized = - exported_model.SerializeAsString(); +// Serializes a protobuf object. Raises python ValueError if serialization +// fails. +inline std::string Serialize(const tsl::protobuf::Message& protobuf_object) { + const std::string serialized = protobuf_object.SerializeAsString(); // Empty string means it failed to serialize the protobuf with an error. See // the docstring for SerializeAsString for details. - if (exported_model_serialized.empty()) { - throw py::value_error("Failed to serialize ExportedModel."); + if (serialized.empty()) { + // Show the name of the protobuf message type to provide more information + // and easier debugging. + const std::string descriptor_name = + protobuf_object.GetDescriptor() == nullptr + ? "unknown" + : protobuf_object.GetDescriptor()->full_name(); + throw py::value_error(absl::StrFormat( + "Failed to serialize protobuf object: %s.", descriptor_name)); } - return exported_model_serialized; + return serialized; } -} // namespace internal - -// Handles `ExportedModel` (c++) <-> `bytes` (python) conversion. The `bytes` -// object in the python layer is a serialization of `ExportedModel`. +// Handles `ProtoT` (c++) <-> `bytes` (python) conversion. The `bytes` +// object in the python layer is a serialization of `ProtoT`. // -// See https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html for -// further details on how custom type conversions work for pybind11. -template <> -struct type_caster { +// The caller of c++ interfaces should make sure to pass valid serialized +// `ProtoT` objects as arguments. Failing to do so results in raising a +// `ValueError`. Similarly, the python implementation of a c++ virtual member +// function that return an `ProtoT` should return a valid serialized `ProtoT`. +// +// See https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html +template >> +struct SerializedProtobufCaster { public: - PYBIND11_TYPE_CASTER(tensorflow::quantization::ExportedModel, - const_name("ExportedModel")); + PYBIND11_TYPE_CASTER(ProtoT, const_name()); - // Loads an `ExportedModel` instance from a python `bytes` object (`src`). + // Loads an `ProtoT` instance from a python `bytes` object (`src`). bool load(handle src, const bool convert) { auto caster = make_caster(); // Make sure the user passed a valid python string. 
- if (!caster.load(src, convert)) { - return false; - } + if (!caster.load(src, convert)) return false; - const absl::string_view exported_model_serialized = + const absl::string_view serialized_proto = cast_op(std::move(caster)); // NOLINTNEXTLINE: Explicit std::string conversion required for OSS. - return value.ParseFromString(std::string(exported_model_serialized)); + return value.ParseFromString(std::string(serialized_proto)); } - // Constructs a `bytes` object after serializing `src`. - static handle cast(tensorflow::quantization::ExportedModel&& src, - return_value_policy policy, handle parent) { + // Constructs a `bytes` object by serializing `src`. + static handle cast(ProtoT&& src, return_value_policy policy, handle parent) { // release() prevents the reference count from decreasing upon the // destruction of py::bytes and returns a raw python object handle. - return py::bytes(internal::Serialize(src)).release(); + return py::bytes(Serialize(src)).release(); } - // Constructs a `bytes` object after serializing `src`. - static handle cast(const tensorflow::quantization::ExportedModel& src, - return_value_policy policy, handle parent) { + // Constructs a `bytes` object by serializing `src`. + static handle cast(const ProtoT& src, return_value_policy policy, + handle parent) { // release() prevents the reference count from decreasing upon the // destruction of py::bytes and returns a raw python object handle. - return py::bytes(internal::Serialize(src)).release(); + return py::bytes(Serialize(src)).release(); } }; -// Python -> cpp conversion for `QuantizationOptions`. Accepts a serialized -// protobuf string and deserializes into an instance of `QuantizationOptions`. +} // namespace internal + +// The following explicit specializations of protobuf `type_caster`s for +// specific protobuf message types are there to have higher priority over those +// defined in `native_proto_caster.h` during the resolution process. This is +// because the type casters in `native_proto_caster.h`, which allow seamlessly +// exchanging protobuf messages across c++-python boundaries, potentially +// without serialization, fail in the open-source environment. +// Explicitly-specialized type casters for serialized protobufs are added on an +// on-demand basis for quantization library. +// TODO: b/308532051 - Make `native_proto_caster.h` work in the open-source +// environment. + template <> -struct type_caster { - public: - PYBIND11_TYPE_CASTER(tensorflow::quantization::QuantizationOptions, - const_name("QuantizationOptions")); +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::quantization::ExportedModel> {}; - bool load(handle src, const bool convert) { - auto caster = make_caster(); - // The user should have passed a valid python string. - if (!caster.load(src, convert)) { - return false; - } +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::quantization::QuantizationOptions> {}; - const absl::string_view quantization_opts_serialized = - cast_op(std::move(caster)); +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::quantization::CalibrationOptions> {}; - // NOLINTNEXTLINE: Explicit std::string conversion required for OSS. 
- return value.ParseFromString(std::string(quantization_opts_serialized)); - } -}; +template <> +struct type_caster + : public internal::SerializedProtobufCaster {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster {}; + +template <> +struct type_caster + : public internal::SerializedProtobufCaster< + tensorflow::calibrator::CalibrationStatistics> {}; } // namespace pybind11::detail diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc new file mode 100644 index 00000000000000..b957ffe469a004 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.cc @@ -0,0 +1,75 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h" + +#include + +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" +#include "tensorflow/core/platform/env.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/status.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace quantization { + +// Unfreezes constants into variables and saves them to a checkpoint files under +// `checkpoint_dir`. `checkpoint_dir` will be created within this function. It +// will return a non-OK status if it already exists or permission is denied. +// TODO(b/261652258): Make sure this works for when there are non-frozen +// variables in the model. 
+absl::Status UnfreezeConstantsAndSaveVariables( + const absl::string_view checkpoint_dir, mlir::MLIRContext &ctx, + mlir::ModuleOp module_op) { + TF_RETURN_IF_ERROR(RunPasses( + /*name=*/kTfQuantConstantUnfreezingStepName, /*add_passes_func=*/ + [](mlir::PassManager &pm) { + pm.addPass(mlir::quant::CreateUnfreezeConstantsPass()); + }, + ctx, module_op)); + + if (const tsl::Status create_dir_status = + Env::Default()->CreateDir(std::string(checkpoint_dir)); + !create_dir_status.ok()) { + LOG(ERROR) << "Failed to create checkpoint directory at: " + << checkpoint_dir; + return create_dir_status; + } + + TF_ASSIGN_OR_RETURN(const auto unused_variable_names, + SaveVariablesToCheckpoint(checkpoint_dir, module_op)); + + return RunPasses( + /*name=*/kTfQuantInsertRestoreOpStepName, + /*add_passes_func=*/ + [](mlir::PassManager &pm) { + pm.addPass(mlir::quant::CreateInsertRestoreOpPass()); + pm.addPass(mlir::quant::CreateInsertSaveOpPass()); + // Initialization by `tf.ConstOp` is no longer required as there is + // a `tf.RestoreV2Op` now. + pm.addPass( + mlir::quant::CreateRemoveVariableInitializationByConstPass()); + }, + ctx, module_op); +} +} // namespace quantization +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h new file mode 100644 index 00000000000000..3086d705f315b7 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_UNFREEZE_CONSTANTS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_UNFREEZE_CONSTANTS_H_ + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace tensorflow { +namespace quantization { + +inline constexpr absl::string_view kTfQuantConstantUnfreezingStepName = + "tf_quant_constant_unfreezing"; +inline constexpr absl::string_view kTfQuantInsertRestoreOpStepName = + "tf_quant_insert_restore_op"; + +absl::Status UnfreezeConstantsAndSaveVariables(absl::string_view checkpoint_dir, + mlir::MLIRContext &ctx, + mlir::ModuleOp module_op); + +} // namespace quantization +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PYTHON_UNFREEZE_CONSTANTS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc index 0b9cdc09ca5b93..4825d316f6e691 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc @@ -17,6 +17,7 @@ limitations under the License. 
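
(Stepping back to the Python entry points changed earlier in this diff.) quantize_ptq_dynamic_range and quantize_weight_only no longer hand a serialized ExportedModel back for Python to re-save; they take a source and a destination SavedModel path and write the quantized model themselves, with every protobuf argument crossing as serialized bytes and Python callbacks supplied via py_function_lib.PyFunctionLibrary(). Relatedly, _populate_quantization_options_default_values now sets an unspecified preset_method to static-range INT8 up front and applies the min_num_elements_for_weights default whenever the field is unset, not only for the weight-only and dynamic-range presets. A sketch of the new call shape follows; _serialize_signature_def_map is not shown in this hunk, so the bytes-per-SignatureDef mapping below is an assumption, and the paths are placeholders:

signature_def_map = save_model.get_signatures_from_saved_model(
    src_saved_model_path,
    quantization_options.signature_keys,
    quantization_options.tags,
)

# Assumed behavior of _serialize_signature_def_map: {signature_key: serialized SignatureDef}.
serialized_map = {
    key: sig_def.SerializeToString() for key, sig_def in signature_def_map.items()
}

pywrap_quantize_model.quantize_ptq_dynamic_range(
    src_saved_model_path,   # e.g. '/tmp/model' (placeholder)
    dst_saved_model_path,   # e.g. '/tmp/model_quantized' (placeholder)
    quantization_options_serialized=quantization_options.SerializeToString(),
    signature_keys=list(quantization_options.signature_keys),
    signature_def_map_serialized=serialized_map,
    function_aliases=dict(function_aliases),
    py_function_library=py_function_lib.PyFunctionLibrary(),
)
quantized_model = saved_model_load.load(dst_saved_model_path)
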
#include #include "absl/strings/string_view.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project @@ -36,7 +37,7 @@ using ::tensorflow::quantization::QuantizationOptions; // Currently server cannot handle UniformQuantizedTypes. Instead, unpack // quantized ops to primitive StableHLO ops. We currently go through a // StableHLO <-> MHLO roundtrip to utilize the MHLOQuantToInt pass. -void AddStablehloQuantToIntPasses(mlir::PassManager &pm) { +void AddStablehloQuantToIntPasses(mlir::OpPassManager &pm) { pm.addPass(mlir::createInlinerPass()); // StableHLO -> MHLO legalization. pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); @@ -50,32 +51,50 @@ void AddStablehloQuantToIntPasses(mlir::PassManager &pm) { } void AddStaticRangeQuantizationPass( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, std::optional mlir_dump_file_prefix) { pm.addPass(mlir::quant::stablehlo::createQuantizeCompositeFunctionsPass()); } -void AddConvertTpuToCpuModelPasses(mlir::PassManager &pm) { +void AddConvertTpuToCpuModelPasses(mlir::OpPassManager &pm) { pm.addPass(mlir::quant::CreateConvertTpuModelToCpuPass()); pm.addPass(mlir::createInlinerPass()); pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::quant::CreateCastBf16OpsToF32Pass()); } +// Legalizes shape/tensor/arith dialect ops to StableHLO for handling dynamic +// shapes, by going through a round-trip to MHLO. +void AddShapeLegalizationPasses(mlir::OpPassManager &pm) { + pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); + pm.addNestedPass( + mlir::mhlo::createShapeLegalizeToHloPass(/*legalizeConstraints=*/true)); + // The following 2 passes are used to clean up the spurious UnrealizedCast ops + // and shape.assuming regions leftover from the ShapeLegalizeToHlo pass. See + // pass definition for details. + pm.addPass(mlir::createReconcileUnrealizedCastsPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); +} + // NOMUTANTS -- Add tests for individual passes with migration below. // Serializes the StableHLO module into a tf.XlaCallModuleOp for compatibility // with passes that expect TF format. This also allows the StableHLO ops to be // exported as a TF SavedModel. -void AddCallModuleSerializationPasses(mlir::PassManager &pm) { +void AddCallModuleSerializationPasses(mlir::OpPassManager &pm) { + AddShapeLegalizationPasses(pm); pm.addPass( mlir::quant::stablehlo:: createReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass()); + // ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass may create + // duplicate constants. Add canonicalizer to deduplicate. 
+ pm.addNestedPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::TF::CreateXlaCallModuleSerializationPass()); } } // namespace void AddQuantizeQatPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix) { pm.addNestedPass( mlir::quant::CreateConvertFakeQuantToQdqPass()); @@ -123,7 +142,7 @@ void AddQuantizeQatPasses( } void AddQuantizePtqDynamicRangePasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix) { pm.addNestedPass( mlir::TF::CreateUnrollBatchMatMulPassPass()); @@ -167,7 +186,7 @@ void AddQuantizePtqDynamicRangePasses( } void AddQuantizePtqPreCalibrationPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options) { + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options) { if (quantization_options.op_set() == OpSet::UNIFORM_QUANTIZED) { pm.addNestedPass( mlir::TF::CreateUnrollBatchMatMulPassPass()); @@ -195,7 +214,7 @@ void AddQuantizePtqPreCalibrationPasses( } void AddQuantizePtqPostCalibrationPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix) { pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::TF::CreateTFShapeInferencePass()); @@ -228,14 +247,12 @@ void AddQuantizePtqPostCalibrationPasses( } // StableHLO Quantization passes that are ran if StableHLO opset is selected. -// TODO: b/298581932 - Add tests for passes below once migration is complete. void AddQuantizePtqPreCalibrationStablehloPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options) { + mlir::OpPassManager &pm, const CalibrationOptions &calibration_options) { pm.addPass( mlir::quant::stablehlo::createLiftQuantizableSpotsAsFunctionsPass()); pm.addNestedPass( - mlir::quant::CreateInsertCustomAggregationOpsPass( - quantization_options.calibration_options())); + mlir::quant::CreateInsertCustomAggregationOpsPass(calibration_options)); pm.addPass(mlir::quant::CreateIssueIDsOfCustomAggregationOpsPass()); // NOMUTANTS -- Add tests after all passes in function below are migrated. // StableHLO Quantizer currently uses TF's calibration passes. Serialize @@ -243,25 +260,29 @@ void AddQuantizePtqPreCalibrationStablehloPasses( AddCallModuleSerializationPasses(pm); } -// TODO: b/298581932 - Migrate and add passes below. void AddQuantizePtqPostCalibrationStablehloPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, std::optional mlir_dump_file_prefix) { // Deserializes the StableHLO module embedded in tf.XlaCallModule and lifts // the StableHLO functions to the top level module. This is needed for // StableHLO quantization. + // + // Calibration may result in partial shape information loss. Add this pass to + // populate shape information based on the known information. 
+ pm.addPass(mlir::quant::stablehlo::createPopulateShapePass()); pm.addPass(mlir::TF::CreateXlaCallModuleDeserializationPass()); pm.addPass(mlir::quant::stablehlo::createRestoreFunctionNamePass()); + pm.addPass(mlir::quant::stablehlo::createUnwrapXlaCallModuleOpPass()); + pm.addPass(mlir::createSymbolDCEPass()); pm.addNestedPass( mlir::quant::CreateConvertCustomAggregationOpToQuantStatsPass()); - AddStaticRangeQuantizationPass(pm, quantization_options, - mlir_dump_file_prefix); + AddStaticRangeQuantizationPass(pm, mlir_dump_file_prefix); AddStablehloQuantToIntPasses(pm); AddCallModuleSerializationPasses(pm); } void AddQuantizeWeightOnlyPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix) { pm.addPass(mlir::TF::CreateTFShapeInferencePass()); // Add PrepareLiftingPass to utilize its functionalities like folding batch diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h index 3aef23b5667d51..5d757b4c944441 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h @@ -26,35 +26,35 @@ namespace quantization { // mlir_dump_file_prefix is an optional field that is used for debugging to save // mlir dump files. -void AddQuantizeQatPasses(mlir::PassManager &pm, +void AddQuantizeQatPasses(mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix = std::nullopt); void AddQuantizePtqDynamicRangePasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix = std::nullopt); void AddQuantizeWeightOnlyPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix = std::nullopt); void AddQuantizePtqPreCalibrationPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options); + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options); void AddQuantizePtqPostCalibrationPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, const QuantizationOptions &quantization_options, std::optional mlir_dump_file_prefix = std::nullopt); // StableHLO Quantization passes that are ran if StableHLO opset is selected. 
void AddQuantizePtqPreCalibrationStablehloPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options); + mlir::OpPassManager &pm, const CalibrationOptions &quantization_options); void AddQuantizePtqPostCalibrationStablehloPasses( - mlir::PassManager &pm, const QuantizationOptions &quantization_options, + mlir::OpPassManager &pm, std::optional mlir_dump_file_prefix = std::nullopt); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir index fa747357169f55..b8ed5d5f361d36 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir @@ -1,62 +1,60 @@ -// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=MIN_MAX' | FileCheck --check-prefix=MIN-MAX-CHECK %s -// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=AVERAGE_MIN_MAX' | FileCheck --check-prefix=AVERAGE-MIN-MAX-CHECK %s -// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_PERCENTILE' | FileCheck --check-prefix=HISTOGRAM-PERCENTILE-CHECK %s -// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_BRUTEFORCE' | FileCheck --check-prefix=HISTOGRAM-MSE-BRUTEFORCE-CHECK %s -// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_MAX_FREQUENCY' | FileCheck --check-prefix=HISTOGRAM-MSE-MAX-FREQUENCY-CHECK %s -// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_SYMMETRIC' | FileCheck --check-prefix=HISTOGRAM-MSE-SYMMETRIC-CHECK %s +// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=MIN_MAX' -split-input-file | FileCheck --check-prefix=MIN-MAX-CHECK %s +// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=AVERAGE_MIN_MAX' -split-input-file | FileCheck --check-prefix=AVERAGE-MIN-MAX-CHECK %s +// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_PERCENTILE' -split-input-file | FileCheck --check-prefix=HISTOGRAM-PERCENTILE-CHECK %s +// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_BRUTEFORCE' -split-input-file | FileCheck --check-prefix=HISTOGRAM-MSE-BRUTEFORCE-CHECK %s +// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_MAX_FREQUENCY' -split-input-file | FileCheck --check-prefix=HISTOGRAM-MSE-MAX-FREQUENCY-CHECK %s +// RUN: tf-quant-opt %s -quant-insert-custom-aggregation-ops='test-case=HISTOGRAM_MSE_SYMMETRIC' -split-input-file | FileCheck --check-prefix=HISTOGRAM-MSE-SYMMETRIC-CHECK %s module { - func.func @add_custom_ops(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { - %add = "tf.AddV2"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - func.return %add : tensor<*xf32> + func.func @wrap_composite_func(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.PartitionedCall"(%arg0, %arg1) <{f = @composite_conv2d_with_relu6_fn}> {_tfl_quant_trait = "fully_quantizable"} + : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> } - func.func @no_custom_ops_on_non_f32_type(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { - %add = "tf.AddV2"(%arg0, %arg1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> - func.return %add : tensor<*xi32> + func.func 
@no_composite_func(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> { + %add = "tf.AddV2"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + func.return %add : tensor<*xf32> } - func.func @composite_conv2d_with_bias_and_relu6_fn(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + func.func @composite_conv2d_with_relu6_fn(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { %0 = "tf.Conv2D"(%arg0, %arg1) {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - %1 = "tf.BiasAdd"(%0, %arg2) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> - %2 = "tf.Relu6"(%1) : (tensor<*xf32>) -> tensor<*xf32> - func.return %2 : tensor<*xf32> + %1 = "tf.Relu6"(%0) : (tensor<*xf32>) -> tensor<*xf32> + func.return %1 : tensor<*xf32> } } // CalibrationOptions(calibration_method=CALIBRATION_METHOD_MIN_MAX) -// MIN-MAX-CHECK: func @add_custom_ops +// MIN-MAX-CHECK: func @wrap_composite_func // MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) // MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> -// MIN-MAX-CHECK: func @no_custom_ops_on_non_f32_type +// MIN-MAX-CHECK: func @no_composite_func // MIN-MAX-CHECK-NEXT: "tf.AddV2" // MIN-MAX-CHECK-NEXT: return -// MIN-MAX-CHECK: func @composite_conv2d_with_bias_and_relu6_fn +// MIN-MAX-CHECK: func @composite_conv2d_with_relu6_fn // MIN-MAX-CHECK-NEXT: "tf.Conv2D" -// MIN-MAX-CHECK-NEXT: "tf.BiasAdd" // MIN-MAX-CHECK-NEXT: "tf.Relu6" // MIN-MAX-CHECK-NEXT: return // CalibrationOptions(calibration_method=CALIBRATION_METHOD_AVERAGE_MIN_MAX) -// AVERAGE-MIN-MAX-CHECK: func @add_custom_ops +// AVERAGE-MIN-MAX-CHECK: func @wrap_composite_func // AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// AVERAGE-MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// AVERAGE-MIN-MAX-CHECK-NEXT: 
[[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) // AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // AVERAGE-MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> -// AVERAGE-MIN-MAX-CHECK: func @no_custom_ops_on_non_f32_type +// AVERAGE-MIN-MAX-CHECK: func @no_composite_func // AVERAGE-MIN-MAX-CHECK-NEXT: "tf.AddV2" // AVERAGE-MIN-MAX-CHECK-NEXT: return -// AVERAGE-MIN-MAX-CHECK: func @composite_conv2d_with_bias_and_relu6_fn +// AVERAGE-MIN-MAX-CHECK: func @composite_conv2d_with_relu6_fn // AVERAGE-MIN-MAX-CHECK-NEXT: "tf.Conv2D" -// AVERAGE-MIN-MAX-CHECK-NEXT: "tf.BiasAdd" // AVERAGE-MIN-MAX-CHECK-NEXT: "tf.Relu6" // AVERAGE-MIN-MAX-CHECK-NEXT: return @@ -64,20 +62,19 @@ module { // calibration_method=CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, // calibration_parameters=CalibrationParameters(initial_num_bins=256, min_percentile=0.001, max_percentile=99.999) // ) -// HISTOGRAM-PERCENTILE-CHECK: func @add_custom_ops +// HISTOGRAM-PERCENTILE-CHECK: func @wrap_composite_func // HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) // HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK-NEXT: return [[res]] : tensor<*xf32> -// HISTOGRAM-PERCENTILE-CHECK: func @no_custom_ops_on_non_f32_type +// HISTOGRAM-PERCENTILE-CHECK: func @no_composite_func // HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.AddV2" // HISTOGRAM-PERCENTILE-CHECK-NEXT: return -// HISTOGRAM-PERCENTILE-CHECK: func @composite_conv2d_with_bias_and_relu6_fn +// HISTOGRAM-PERCENTILE-CHECK: func @composite_conv2d_with_relu6_fn // HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.Conv2D" -// HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.BiasAdd" // HISTOGRAM-PERCENTILE-CHECK-NEXT: "tf.Relu6" // HISTOGRAM-PERCENTILE-CHECK-NEXT: return @@ -85,20 +82,19 @@ module { // calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) -// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @add_custom_ops +// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @wrap_composite_func // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 
: f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return [[res]] : tensor<*xf32> -// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @no_custom_ops_on_non_f32_type +// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @no_composite_func // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.AddV2" // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return -// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @composite_conv2d_with_bias_and_relu6_fn +// HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @composite_conv2d_with_relu6_fn // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.Conv2D" -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.BiasAdd" // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: "tf.Relu6" // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return @@ -106,20 +102,19 @@ module { // calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @add_custom_ops +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @wrap_composite_func // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return [[res]] : tensor<*xf32> -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @no_custom_ops_on_non_f32_type +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @no_composite_func // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.AddV2" // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @composite_conv2d_with_bias_and_relu6_fn +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @composite_conv2d_with_relu6_fn // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.Conv2D" -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.BiasAdd" // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: "tf.Relu6" // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return @@ -127,20 +122,56 @@ module { // calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, // calibration_parameters=CalibrationParameters(initial_num_bins=256) // ) -// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @add_custom_ops +// HISTOGRAM-MSE-SYMMETRIC-CHECK: func 
@wrap_composite_func // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]] = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]] = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]] = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return [[res]] : tensor<*xf32> -// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @no_custom_ops_on_non_f32_type +// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @no_composite_func // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.AddV2" // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return -// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @composite_conv2d_with_bias_and_relu6_fn +// HISTOGRAM-MSE-SYMMETRIC-CHECK: func @composite_conv2d_with_relu6_fn // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.Conv2D" -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.BiasAdd" // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: "tf.Relu6" // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return + +// ----- + +module { + // CHECK-LABEL: func.func @main + func.func @main(%arg0: tensor, %arg1: tensor<100352x10xf32>) -> tensor { + // CHECK-DAG: %[[ARG0_ID:.*]] = "tf.Identity"(%arg0) + // CHECK-DAG: %[[ARG1_ID:.*]] = "tf.Identity"(%arg1) + // CHECK-DAG: %[[ARG0_AGG:.*]] = "tf.CustomAggregator"(%[[ARG0_ID]]) + // CHECK-DAG: %[[ARG1_AGG:.*]] = "tf.CustomAggregator"(%[[ARG1_ID]]) + // CHECK: %[[RES:.*]] = "tf.XlaCallModule"(%[[ARG0_AGG]], %[[ARG1_AGG]]) + // CHECK: %[[RES_AGG:.*]] = "tf.CustomAggregator"(%[[RES]]) + // CHECK-DAG: %[[RES_ID:.*]] = "tf.Identity"(%[[RES_AGG]]) + // CHECK: return %[[RES_ID]] : tensor + %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor + %1 = "tf.Identity"(%arg1) {device = ""} : (tensor<100352x10xf32>) -> tensor<100352x10xf32> + %2 = "tf.XlaCallModule"(%0, %1) <{ + Sout = [#tf_type.shape], dim_args_spec = [], + disabled_checks = [], function_list = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + }> { + _entry_function = @composite_dot_general_fn_1, + _original_entry_function = "composite_dot_general_fn_1", + _tfl_quant_trait = "fully_quantizable" + } : (tensor, tensor<100352x10xf32>) -> tensor + %3 = "tf.Identity"(%2) {device = ""} : (tensor) -> tensor + return %3 : tensor + } + + // CHECK-LABEL: func.func private @composite_dot_general_fn_1 + func.func private @composite_dot_general_fn_1(%arg0: tensor, %arg1: tensor<100352x10xf32>) -> tensor { + // CHECK-NOT: tf.CustomAggregator + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<100352x10xf32>) -> tensor + return %0 : tensor + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD index e3b67f59a5e829..e7b42fcd09e2aa 
100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD @@ -1,4 +1,3 @@ -load("@llvm-project//mlir:tblgen.bzl", "td_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") @@ -34,39 +33,6 @@ cc_library( ], ) -td_library( - name = "lift_as_function_call_utils_td_files", - srcs = [ - "lift_as_function_call_utils.td", - ], - compatible_with = get_compatible_with_portable(), - deps = [ - "@llvm-project//mlir:FuncTdFiles", - ], -) - -cc_library( - name = "lift_as_function_call_utils", - srcs = ["lift_as_function_call_utils.cc"], - hdrs = ["lift_as_function_call_utils.h"], - compatible_with = get_compatible_with_portable(), - deps = [ - "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", - "//tensorflow/compiler/mlir/quantization/tensorflow/cc:quantization_unit_loc", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", - "//tensorflow/core:framework_lite", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - ], -) - cc_library( name = "tf_to_uniform_attribute_utils", srcs = ["tf_to_uniform_attribute_utils.cc"], @@ -74,7 +40,7 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow/ops:uniform_op_quant_spec", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", @@ -112,7 +78,7 @@ cc_library( hdrs = ["tf_to_xla_attribute_utils.h"], compatible_with = get_compatible_with_portable(), deps = [ - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:constant_fold", "//tensorflow/lite/kernels:padding", "@com_google_absl//absl/algorithm:container", @@ -129,7 +95,7 @@ tf_cc_test( deps = [ ":tf_to_xla_attribute_utils", "//tensorflow/c/eager:c_api", - "//tensorflow/compiler/mlir/quantization/tensorflow:pass_utils", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "@com_google_googletest//:gtest_main", "@llvm-project//llvm:Support", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h index 35a00db79e368f..922729d9c8c3a6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.h @@ -21,7 +21,7 @@ limitations under the License. 
#include "llvm/ADT/StringMap.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" namespace mlir::quant { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc index 168c99a7b2cf86..f1d7a6ae576c7b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc @@ -20,8 +20,8 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/strings/str_format.h" #include "llvm/ADT/ArrayRef.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" #include "xla/xla_data.pb.h" #include "tensorflow/lite/kernels/padding.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h index 52dcdcbc780325..80212b9acec5fb 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.h @@ -19,7 +19,6 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_UTILS_TF_TO_XLA_ATTRIBUTE_UTILS_H_ #include "mlir/IR/Builders.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" namespace mlir::quant { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc index b71ccae8f7c0a0..cc4bbb344026da 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc @@ -29,7 +29,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" namespace mlir::quant { diff --git a/tensorflow/compiler/mlir/stablehlo/BUILD b/tensorflow/compiler/mlir/stablehlo/BUILD index f1265510044fdd..c16a7118cecce0 100644 --- a/tensorflow/compiler/mlir/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/stablehlo/BUILD @@ -53,5 +53,6 @@ py_strict_test( python_version = "PY3", deps = [ ":stablehlo", + #internal proto upb dep ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index a24ea5e7a8fe63..77530c113b9be2 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -353,6 +353,7 @@ cc_library( ":attribute_utils", ":convert_type", ":dynamic_shape_utils", + ":side_effect_analysis_util", ":tensorflow_all_ops_inc_gen", ":tensorflow_attributes", ":tensorflow_op_interfaces", @@ -407,6 +408,7 @@ cc_library( deps = [ ":attribute_utils", ":serialize_mlir_module_utils", + ":side_effect_analysis_util", ":tensorflow_attributes", ":tensorflow_op_interfaces", ":tensorflow_op_interfaces_inc_gen", @@ -451,6 +453,7 @@ cc_library( "ir/tf_remaining_ops.h.inc", ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ + ":side_effect_analysis_util", ":tensorflow_attributes", ":tensorflow_op_interfaces", ":tensorflow_op_interfaces_inc_gen", @@ -492,6 +495,7 @@ cc_library( "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], deps = [ + ":side_effect_analysis_util", ":tensorflow_all_ops_inc_gen", ":tensorflow_attributes", ":tensorflow_op_interfaces", @@ -760,133 +764,6 @@ cc_library( ], ) -cc_library( - name = "upgrade_graph", - srcs = ["translate/upgrade_graph.cc"], - hdrs = ["translate/upgrade_graph.h"], - deps = [ - ":attribute_utils", - "//tensorflow/compiler/tf2xla:functionalize_control_flow", - "//tensorflow/core:core_cpu_base", - "//tensorflow/core:framework", - "//tensorflow/core/common_runtime:device", - "//tensorflow/core/common_runtime:device_factory", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:grappler_item_builder", - "//tensorflow/core/grappler/clusters:virtual_cluster", - "//tensorflow/core/grappler/optimizers:meta_optimizer", - "//tensorflow/core/protobuf:for_core_protos_cc", - "@llvm-project//llvm:Support", - ], -) - -cc_library( - name = "export_graphdef", - srcs = [ - "translate/export_graphdef.cc", - ], - hdrs = [ - "translate/export_graphdef.h", - ], - visibility = ["//visibility:public"], - deps = [ - ":convert_type", - ":error_util", - ":export_tf_dialect_op", - ":export_utils", - ":mlir_roundtrip_flags", - ":tensorflow", - ":translate_utils", - ":verify_suitable_for_graph_export", - "//tensorflow/compiler/mlir:op_or_arg_name_mapper", - "//tensorflow/compiler/mlir/utils:name_utils", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:graph", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/graph/regularization:util", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - 
"@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:optional", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@local_xla//xla:status_macros", - ], -) - -cc_library( - name = "import_model", - srcs = [ - "translate/import_model.cc", - ], - hdrs = [ - "translate/export_graphdef.h", - "translate/import_model.h", - ], - deps = [ - ":attribute_utils", - ":convert_attr", - ":convert_tensor", - ":convert_type", - ":dump_mlir_util", - ":dynamic_shape_utils", - ":error_util", - ":mangling_util", - ":mlir_import_options", - ":mlir_roundtrip_flags", - ":tensorflow", - ":tensorflow_attributes", - ":tensorflow_types", - ":translate_utils", - ":upgrade_graph", - "//tensorflow/cc/saved_model:bundle_v2", - "//tensorflow/cc/saved_model:constants", - "//tensorflow/cc/saved_model:loader_lite", - "//tensorflow/cc/saved_model:loader_util", - "//tensorflow/compiler/jit:shape_inference_helpers", - "//tensorflow/compiler/mlir:op_or_arg_name_mapper", - "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", - "//tensorflow/compiler/mlir/tensorflow/transforms:initialize_variables_in_session_init", - "//tensorflow/compiler/mlir/tensorflow/transforms:lift_variables_lib", - "//tensorflow/compiler/mlir/tensorflow/transforms:mark_initialized_variables_lib", - "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", - "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:graph", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/grappler/utils:transitive_fanin", - "//tensorflow/core/platform:crash_analysis", - "//tensorflow/core/platform:types", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/synchronization", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@local_xla//xla:status_macros", - "@local_xla//xla/client:sharding_builder", - "@local_xla//xla/hlo/ir:hlo", - "@local_xla//xla/service:hlo_parser", - ], -) - cc_library( name = "parse_text_proto", srcs = ["utils/parse_text_proto.cc"], @@ -912,20 +789,6 @@ cc_library( ], ) -tf_cc_test( - name = "tf_mlir_translate_registration_test", - size = "small", - srcs = ["translate/tf_mlir_translate_registration_test.cc"], - deps = [ - ":translate_registration", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:TranslateLib", - ], -) - cc_library( name = "export_utils", srcs = [ @@ -1002,92 +865,6 @@ cc_library( ], ) -cc_library( - name = "export_tf_dialect_op", - srcs = [ - "translate/export_tf_dialect_op.cc", - ], - hdrs = [ - "translate/export_tf_dialect_op.h", - ], - deps = [ - ":convert_type", - ":export_utils", - ":tensorflow", - "//tensorflow/compiler/mlir/utils:string_container_utils", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - 
"@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:DerivedAttributeOpInterface", - "@llvm-project//mlir:IR", - "@local_xla//xla:status_macros", - ], -) - -cc_library( - name = "translate_tf_dialect_op", - srcs = ["translate/translate_tf_dialect_op.cc"], - deps = [ - ":export_tf_dialect_op", - ":tensorflow", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TranslateLib", - ], - alwayslink = 1, -) - -cc_library( - name = "mlir_roundtrip_pass", - srcs = ["translate/mlir_roundtrip_pass.cc"], - hdrs = ["translate/mlir_roundtrip_pass.h"], - deps = [ - ":error_util", - ":export_graphdef", - ":import_model", - ":mlir_roundtrip_flags", - "//tensorflow/core:core_cpu_lib", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@local_xla//xla:status_macros", - ], -) - -cc_library( - name = "mlir_roundtrip_pass_registration", - srcs = ["translate/mlir_roundtrip_pass_registration.cc"], - deps = [ - ":mlir_roundtrip_pass", - ], - alwayslink = 1, -) - -cc_library( - name = "mlir_roundtrip_flags", - srcs = ["translate/mlir_roundtrip_flags.cc"], - hdrs = ["translate/mlir_roundtrip_flags.h"], - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:types", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@local_xla//xla:status_macros", - ], -) - cc_library( name = "convert_attr", srcs = ["utils/convert_attr.cc"], @@ -1249,90 +1026,6 @@ cc_library( ], ) -cc_library( - name = "mlir_import_options", - hdrs = ["translate/mlir_import_options.h"], - visibility = ["//visibility:public"], -) - -cc_library( - name = "translate_lib", - srcs = ["translate/tf_mlir_translate.cc"], - hdrs = ["translate/tf_mlir_translate.h"], - visibility = ["//visibility:public"], - deps = [ - ":error_util", - ":import_model", - ":import_utils", - ":mangling_util", - ":mlir_import_options", - ":mlir_roundtrip_flags", - "//tensorflow/cc/saved_model:bundle_v2", - "//tensorflow/cc/saved_model:loader_lite", - "//tensorflow/cc/saved_model:reader", - "//tensorflow/core:graph", - "//tensorflow/core:lib", - "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:ops", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/grappler/utils:transitive_fanin", - "//tensorflow/core/util/tensor_bundle:byteswaptensor", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - ], -) - -cc_library( - name = "translate_cl_options", - srcs = [ - "translate/tf_mlir_translate_cl.cc", - ], - hdrs = [ - "translate/tf_mlir_translate_cl.h", - ], - deps = [ - "@llvm-project//llvm:Support", - ], - alwayslink = 1, -) - -cc_library( - name = "translate_registration", - srcs = [ - "translate/tf_mlir_translate_registration.cc", - ], - deps = [ - ":export_graphdef", - ":mlir_roundtrip_flags", - ":tensorflow", - ":translate_cl_options", - ":translate_lib", - 
"//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:core_cpu_base", - "//tensorflow/core:framework", - "//tensorflow/core:protos_all_cc", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:TranslateLib", - "@local_xla//xla/client:client_library", - "@local_xla//xla/client:compile_only_client", - "@local_xla//xla/service/cpu:cpu_compiler", - "@local_xla//xla/service/cpu:cpu_transfer_manager", - "@local_xla//xla/stream_executor", - "@local_xla//xla/stream_executor/host:host_platform", - "@local_xla//xla/stream_executor/host:host_platform_id", - ], - alwayslink = 1, -) - tf_cc_test( name = "error_util_test", srcs = ["utils/error_util_test.cc"], @@ -1488,6 +1181,7 @@ cc_library( ":device_util", ":tensorflow", ":tensorflow_types", + "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir/utils:string_container_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -1513,6 +1207,7 @@ tf_cc_test( ":serialize_mlir_module_utils", ":tensorflow", ":tpu_rewrite_device_util", + "//tensorflow/compiler/jit:flags", "//tensorflow/core:framework", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -1937,27 +1632,6 @@ cc_library( ], ) -cc_library( - name = "split_into_island_per_op_pass", - srcs = ["translate/split_into_island_per_op_pass.cc"], - hdrs = [ - "ir/tf_executor.h", - "translate/split_into_island_per_op_pass.h", - ], - deps = [ - ":tensorflow", - ":tensorflow_executor_inc_gen", - ":tensorflow_types", - "//tensorflow/compiler/mlir/tensorflow/transforms:tf_pass_inc_gen", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:InferTypeOpInterface", - "@llvm-project//mlir:Pass", - ], -) - tf_cc_test( name = "xla_rewrite_util_test", size = "small", @@ -1968,6 +1642,7 @@ tf_cc_test( ":tensorflow", ":tpu_rewrite_device_util", ":xla_rewrite_util", + "//tensorflow/compiler/jit:flags", "//tensorflow/core:framework", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -1978,6 +1653,22 @@ tf_cc_test( ], ) +cc_library( + name = "side_effect_analysis_util", + srcs = [ + "utils/side_effect_analysis_util.cc", + ], + hdrs = [ + "utils/side_effect_analysis_util.h", + ], + deps = [ + "tensorflow_side_effects", + "tensorflow_types", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + build_test( name = "tensorflow_build_test", targets = [ @@ -2008,3 +1699,30 @@ build_test( # ) # # copybara:uncomment_end(google-only) + +# Required as we created the transforms subpackage and need to update +# these BUILD targets in a follow up. 
+aliased_targets = [ + "export_graphdef", + "import_model", + "export_tf_dialect_op", + "translate_tf_dialect_op", + "mlir_roundtrip_pass", + "mlir_roundtrip_pass_registration", + "mlir_roundtrip_flags", + "mlir_import_options", + "translate_lib", + "translate_cl_options", + "translate_registration", + "split_into_island_per_op_pass", + "upgrade_graph", +] + +[ + alias( + name = target, + actual = "//tensorflow/compiler/mlir/tensorflow/translate:%s" % target, + visibility = ["//visibility:public"], + ) + for target in aliased_targets +] diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index 744fa37a914de0..b0d730898316d5 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -40,6 +40,7 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -346,6 +347,7 @@ class OpSideEffectCollector { } bool IsCallToPureFunction(Operation* callOp) const; + bool IsPureFunction(func::FuncOp func_op) const; private: // Adds op-based side effects from all ops in `region` to `op` side effects. @@ -510,18 +512,42 @@ bool OpSideEffectCollector::IsCallToPureFunction(Operation* callOp) const { return false; // not a call func::FuncOp func_op = dyn_cast(call.resolveCallable( &symbol_table_collection_)); + return IsPureFunction(func_op); +} + +bool OpSideEffectCollector::IsPureFunction(func::FuncOp func_op) const { auto it = is_pure_function_.find(func_op); if (it == is_pure_function_.end()) { bool is_pure = true; + is_pure_function_[func_op] = is_pure; // prevent infinite recursion func_op->walk([&](Operation* op) { - if (op == func_op) return WalkResult::advance(); + if (op == func_op) { + return WalkResult::advance(); + } + // AssertOp is not, technically, pure. However, we treat functions + // that contain an assert as pure, so that graphs with and without + // assert don't have different side effect semantics. Also see + // b/309824992 for the challenges associated with improving the side + // effect modelling of Assert on the op level. 
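The new IsPureFunction helper walks a function and treats it as pure when every op in it is side-effect free, is an Assert, is an If whose branches are pure, or is a call to another pure function; the cache entry is pre-seeded before the walk to break recursion cycles. A minimal TF-dialect sketch of what now qualifies as pure (the function name and tensor shapes here are illustrative, not taken from this patch):

// Reported as pure after this change: tf.Assert is the only op with a
// side effect in the body, and Assert is deliberately ignored when
// classifying functions.
func.func @assert_only(%pred: tensor<i1>, %data: tensor<1xf32>) -> tensor<1xf32> {
  "tf.Assert"(%pred, %data) {summarize = 3 : i64} : (tensor<i1>, tensor<1xf32>) -> ()
  func.return %data : tensor<1xf32>
}

A tf.StatefulPartitionedCall to such a function therefore no longer shows up as a sink in side-effect analysis, which is what the cases added to side-effect-analysis-test.mlir later in this patch exercise.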
+ if (llvm::isa(op)) { + return WalkResult::advance(); + } + if (auto if_op = llvm::dyn_cast(op)) { + if (IsPureFunction(if_op.then_function()) && + IsPureFunction(if_op.else_function())) { + return WalkResult::advance(); + } + } + if (IsCallToPureFunction(op)) { + return WalkResult::advance(); + } if (TensorFlowDialect::CanHaveSideEffects(op)) { is_pure = false; return WalkResult::interrupt(); } return WalkResult::advance(); }); - is_pure_function_.insert({func_op, is_pure}); + is_pure_function_[func_op] = is_pure; } return is_pure_function_[func_op]; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h index 95d766359b1d05..a3c95bdf2332a8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h @@ -52,11 +52,13 @@ class TensorFlowExecutorDialect : public Dialect { class ControlType : public Type::TypeBase { public: using Base::Base; + static constexpr ::mlir::StringLiteral name = "tf_executor.control"; }; class TokenType : public Type::TypeBase { public: using Base::Base; + static constexpr ::mlir::StringLiteral name = "tf_executor.token"; }; } // namespace tf_executor diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index f69d3d4f9c97f4..553794ecd25b90 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1198,7 +1198,9 @@ It is computed as: Arg, [{2-D or higher with shape `[..., r_y, c_y]`.}]>:$y, DefaultValuedOptionalAttr:$adj_x, - DefaultValuedOptionalAttr:$adj_y + DefaultValuedOptionalAttr:$adj_y, + DefaultValuedOptionalAttr:$grad_x, + DefaultValuedOptionalAttr:$grad_y ); let results = (outs @@ -1245,7 +1247,9 @@ about broadcasting Arg, [{2-D or higher with shape `[..., r_y, c_y]`.}]>:$y, DefaultValuedOptionalAttr:$adj_x, - DefaultValuedOptionalAttr:$adj_y + DefaultValuedOptionalAttr:$adj_y, + DefaultValuedOptionalAttr:$grad_x, + DefaultValuedOptionalAttr:$grad_y ); let results = (outs @@ -1292,7 +1296,9 @@ about broadcasting Arg, [{2-D or higher with shape `[..., r_y, c_y]`.}]>:$y, DefaultValuedOptionalAttr:$adj_x, - DefaultValuedOptionalAttr:$adj_y + DefaultValuedOptionalAttr:$adj_y, + DefaultValuedOptionalAttr:$grad_x, + DefaultValuedOptionalAttr:$grad_y ); let results = (outs @@ -2095,7 +2101,7 @@ def TF_CeilOp : TF_Op<"Ceil", [Pure, TF_Idempotent, TF_SameOperandsAndResultType }]; } -def TF_CheckNumericsOp : TF_Op<"CheckNumerics", [TF_SameOperandsAndResultTypeResolveRef]> { +def TF_CheckNumericsOp : TF_Op<"CheckNumerics", [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, TF_SameOperandsAndResultTypeResolveRef]> { let summary = "Checks a tensor for NaN and Inf values."; let description = [{ @@ -8615,7 +8621,9 @@ cublas. 
TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$b, DefaultValuedOptionalAttr:$transpose_a, - DefaultValuedOptionalAttr:$transpose_b + DefaultValuedOptionalAttr:$transpose_b, + DefaultValuedOptionalAttr:$grad_a, + DefaultValuedOptionalAttr:$grad_b ); let results = (outs @@ -22008,7 +22016,7 @@ a u64[2] and for PHILOX a u64[3].}]>:$initial_state, let results = (outs TF_Uint64Tensor:$output_key, - TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$output + TensorOf<[TF_Int32, TF_Int64, TF_Int8, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tshape = TF_DerivedOperandTypeAttr<2>; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index cee0e40f9cfeb5..8b8b069ea6f40d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -67,6 +67,7 @@ limitations under the License. #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -85,6 +86,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h" #include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/status.h" @@ -1090,6 +1092,24 @@ OpFoldResult CastOp::fold(FoldAdaptor) { return {}; } +//===----------------------------------------------------------------------===// +// CheckNumericsOp +//===----------------------------------------------------------------------===// + +void CheckNumericsOp::getEffects( + SmallVectorImpl>& + effects) { + effects.emplace_back(MemoryEffects::Write::get(), + ResourceEffects::CheckNumerics::get()); + MarkResourceAsReadOnly(getTensor(), effects); +} + +// For `CheckNumerics` ops the `device` attribute corresponds to the resource +// instance. +std::optional CheckNumericsOp::GetResourceInstanceStr() { + return GetDeviceAttrAsResourceInstanceStr(*this); +} + //===----------------------------------------------------------------------===// // CollectiveReduceV2Op //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 01cbbb9a46967c..122677ee4ad6da 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -88,6 +88,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h" namespace mlir { namespace TF { @@ -2344,21 +2345,13 @@ void TPUExecuteOp::getEffects( effects.emplace_back(MemoryEffects::Write::get(), ResourceEffects::TPUExecute::get()); + // Conservatively mark resource handles as read and write, as without + // analyzing TPUCompile, there is not sufficient information to determine + // effects on resources. For the MLIR bridge, this op will never be + // populated with resource handles and tf.TPUExecuteAndUpdateVariables is + // used instead. for (Value value : getArgs()) { - if (value.getType() - .cast() - .getElementType() - .isa()) { - // Conservatively mark resource handles as read and write, as without - // analyzing TPUCompile, there is not sufficient information to determine - // effects on resources. For the MLIR bridge, this op will never be - // populated with resource handles and tf.TPUExecuteAndUpdateVariables is - // used instead. - effects.emplace_back(MemoryEffects::Read::get(), value, - ResourceEffects::Variable::get()); - effects.emplace_back(MemoryEffects::Write::get(), value, - ResourceEffects::Variable::get()); - } + MarkResourceAsReadAndWrite(value, effects); } } @@ -2373,19 +2366,11 @@ void _XlaRunOp::getEffects( effects.emplace_back(MemoryEffects::Write::get(), ResourceEffects::_XlaRun::get()); + // Conservatively mark resource handles as read and write, as without + // analyzing _XlaCompile, there is not sufficient information to determine + // effects on resources. for (Value value : getArgs()) { - if (value.getType() - .cast() - .getElementType() - .isa()) { - // Conservatively mark resource handles as read and write, as without - // analyzing _XlaCompile, there is not sufficient information to determine - // effects on resources. - effects.emplace_back(MemoryEffects::Read::get(), value, - ResourceEffects::Variable::get()); - effects.emplace_back(MemoryEffects::Write::get(), value, - ResourceEffects::Variable::get()); - } + MarkResourceAsReadAndWrite(value, effects); } } @@ -3059,35 +3044,22 @@ LogicalResult XlaCallModuleOp::verifySymbolUses( void XlaLaunchOp::getEffects( SmallVectorImpl> &effects) { - effects.reserve(getArgs().size() + 1); + effects.reserve(2 * getArgs().size() + 1); effects.emplace_back(MemoryEffects::Write::get(), ResourceEffects::XlaLaunch::get()); + // Conservatively mark resource handles as read and write, as without + // analyzing XlaLaunch, there is not sufficient information to determine + // effects on resources. for (Value value : getArgs()) { - if (value.getType() - .cast() - .getElementType() - .isa()) { - // Conservatively mark resource handles as read and write, as without - // analyzing XlaLaunch, there is not sufficient information to determine - // effects on resources. - effects.emplace_back(MemoryEffects::Read::get(), value, - ResourceEffects::Variable::get()); - effects.emplace_back(MemoryEffects::Write::get(), value, - ResourceEffects::Variable::get()); - } + MarkResourceAsReadAndWrite(value, effects); } } // For `XlaLaunch` ops the `device` attribute corresponds to the resource // instance. std::optional XlaLaunchOp::GetResourceInstanceStr() { - auto device_attr = (*this)->getAttrOfType("device"); - // Treat missing device attribute like unspecified (= empty string) attribute. 
- // Note that different op instances with the same string (including empty - // string) are seen as dependent (same resource instance). - if (!device_attr) return ""; - return device_attr.str(); + return GetDeviceAttrAsResourceInstanceStr(*this); } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h index 6384d0770a3358..9bcc75fbe1e424 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h @@ -121,6 +121,11 @@ struct _XlaRun : public ::mlir::SideEffects::Resource::Base<_XlaRun> { StringRef getName() final { return "_XlaRun"; } }; +struct CheckNumerics + : public ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "CheckNumerics"; } +}; + // Returns true iff resource type with given ID is only self-dependent, i.e., // there are no dependencies to other resource types (including unknown resource // type). diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 612f01ce23ce8a..13cdaa1a445842 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -27,7 +27,7 @@ func.func @testGatherToV2(%params: tensor<4x3xf32>, %indices: tensor<1x2xi32>) - // CHECK-LABEL: testBatchMatMulToV2 func.func @testBatchMatMulToV2(%arg0: tensor<2x3x5xf32>, %arg1: tensor<2x5x7xf32>) -> tensor<2x3x7xf32> { - // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) <{adj_x = false, adj_y = false, grad_x = false, grad_y = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} %0 = "tf.BatchMatMul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3x5xf32>, tensor<2x5x7xf32>) -> tensor<2x3x7xf32> func.return %0: tensor<2x3x7xf32> } @@ -41,7 +41,7 @@ func.func @testDynamicBatchMatMulToV2(%arg0: tensor<2x3x5xf32>, %arg1: tensor, %arg1: tensor<3x2xf32>) -> tensor<2x2xf32> { - // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32> + // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32> %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32> // CHECK: return %0 func.return %0: tensor<2x2xf32> @@ -49,7 +49,7 @@ func.func @testBatchMatMulToMatMul(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32 // CHECK-LABEL: testBatchMatMulV2ToMatMul func.func @testBatchMatMulV2ToMatMul(%arg0: tensor<4x3xf32>, %arg1: tensor<4x5xf32>) -> tensor<3x5xf32> { - // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = false}> {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32> + // CHECK: %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = false}> {device = 
"/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32> %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = false, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<4x3xf32>, tensor<4x5xf32>) -> tensor<3x5xf32> // CHECK: return %0 func.return %0: tensor<3x5xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir index c5cf58971296fb..4285b57a322217 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-regions.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -pass-pipeline='builtin.module(tf-functional-control-flow-to-regions{allow-passthrough-args})' -split-input-file | FileCheck %s +// RUN: tf-opt %s -pass-pipeline='builtin.module(tf-functional-control-flow-to-regions{allow-passthrough-args})' -split-input-file -verify-diagnostics | FileCheck %s // Simple If // CHECK: func private @testIf1Then{{.+}} @@ -298,3 +298,73 @@ func.func @testWhileDevice() { // CHECK: device = "/device:CPU:0" func.return } + +// ----- + +// CHECK-LABEL: func @init +func.func @init(%arg0: tensor<4xf32>) -> tensor<7xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<7xf32> + return %0 : tensor<7xf32> +} + +// CHECK-LABEL: func @next +func.func @next(%arg0: tensor<7xf32>, %arg1: tensor<3xf32>) -> tensor<6xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<6xf32> + return %0 : tensor<6xf32> +} + +// CHECK-LABEL: func @finalize +func.func @finalize(%arg0: tensor<6xf32>, %arg1: tensor<2xf32>) -> tensor<5xf32> { + %0 = builtin.unrealized_conversion_cast to tensor<5xf32> + return %0 : tensor<5xf32> +} + +// CHECK-LABEL: func @testGeneratorDataset +func.func @testGeneratorDataset(%arg0: tensor<4xf32>, + %arg1: tensor<3xf32>, + %arg2: tensor, + %arg3: tensor<2xf32>) { + // CHECK-NOT: tf.GeneratorDataset + // CHECK: tf.GeneratorDatasetRegion + // CHECK: ^ + // CHECK-SAME: tensor<4xf32> + // CHECK: func.call @init + // CHECK: ^ + // CHECK-SAME: tensor<7xf32> + // CHECK-SAME: tensor<3xf32> + // CHECK-NOT: tf_type.resource + // CHECK: func.call @next + // CHECK: ^ + // CHECK-SAME: tensor<6xf32> + // CHECK-SAME: tensor<2xf32> + // CHECK: func.call @finalize + // CHECK-NOT: tf.GeneratorDataset + %0 = "tf.GeneratorDataset"(%arg0, %arg1, %arg2, %arg3) { + device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", + finalize_func = @finalize, + init_func = @init, + next_func = @next, + operandSegmentSizes = array, + output_shapes = [#tf_type.shape<>], + output_types = [!tf_type.string], + metadata = ""} : ( + tensor<4xf32>, + tensor<3xf32>, + tensor, + tensor<2xf32>) -> tensor + return +} + +// ----- + +func.func @testIncompleteGeneratorDataset() { + // expected-error@+1 {{'tf.GeneratorDataset' op failed to convert to region form}} + %0 = "tf.GeneratorDataset"() { + finalize_func = @invalid, + init_func = @invalid, + next_func = @invalid, + output_shapes = [#tf_type.shape<>], + output_types = [!tf_type.string], + metadata = "" } : () -> tensor + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir b/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir deleted file mode 100644 index d7867332a4812c..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir +++ /dev/null @@ -1,192 +0,0 @@ -// RUN: 
tf-opt %s -split-input-file -verify-diagnostics -tf-device-host-launch-to-outside-compiled | FileCheck %s - -// Tests invalid device error returned when invalid device set on module. - -// expected-error@+1 {{not a valid device}} -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["bad_device"]} { - func.func @bad_device_error() -> () { - "tf_device.cluster"() ({ - "tf.A"() : () -> () - "tf_device.launch"() ({ - "tf.B"() : () -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } -} - -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - - // Tests the unwrap of unreplicated launch of a single outside compiled op with no input or output dependencies. - - // CHECK-LABEL: func @single_op_launch_not_host - func.func @single_op_launch_not_host() -> () { - // CHECK: "tf.A" - // CHECK: "tf_device.launch" - // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:TPU:0" - // CHECK: "tf.B" - // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ({ - "tf.A"() : () -> () - "tf_device.launch"() ({ - "tf.B"() : () -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:TPU:0"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - - // CHECK-LABEL: func @single_op_hostlaunch_no_input_output - func.func @single_op_hostlaunch_no_input_output() -> () { - // CHECK: "tf.A" - // CHECK-NOT: "tf_device.launch" - // CHECK-NEXT: "tf.B" - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ({ - "tf.A"() : () -> () - "tf_device.launch"() ({ - "tf.B"() : () -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - - // CHECK-LABEL: func @single_op_host_launch_input_output - func.func @single_op_host_launch_input_output() -> () { - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK-NOT: "tf_device.launch" - // CHECK-NEXT: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C"(%[[B_OUTPUT]]) - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ({ - %1 = "tf.A"() : () -> (tensor) - %2 = "tf_device.launch"() ({ - %3 = "tf.B"(%1) : (tensor) -> (tensor) - tf_device.return %3 : tensor - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor) - %4 = "tf.C"(%2) : (tensor) -> tensor - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - - // CHECK-LABEL: func @multiple_ops_host_launch_input_output - func.func @multiple_ops_host_launch_input_output() -> () { - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK-NOT: "tf_device.launch" - // CHECK-NEXT: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) - // CHECK-SAME: _xla_outside_compilation - // CHECK-NEXT: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[B_OUTPUT]]) - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C"(%[[D_OUTPUT]]) - // CHECK-NEXT: 
tf_device.return - "tf_device.cluster"() ({ - %1 = "tf.A"() : () -> (tensor) - %2 = "tf_device.launch"() ({ - %3 = "tf.B"(%1) : (tensor) -> (tensor) - %4 = "tf.D"(%3) : (tensor) -> (tensor) - tf_device.return %4 : tensor - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor) - %5 = "tf.C"(%2) : (tensor) -> tensor - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - - // Tests a host launch that's called from a tf_device.cluster. - - func.func @called_hostlaunch() -> () { - "tf_device.cluster"() ({ - "tf.PartitionedCall"() {f = @called_hostlaunch_callee} : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - // CHECK-LABEL: func @called_hostlaunch_callee - func.func @called_hostlaunch_callee() -> () { - // CHECK: "tf.A" - // CHECK-NOT: "tf_device.launch" - // CHECK-NEXT: "tf.B" - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" - "tf.A"() : () -> () - "tf_device.launch"() ({ - "tf.B"() : () -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () - "tf.C"() : () -> () - func.return - } - - // Test that the same outside compiled function cannot be called from two - // different TPU clusters. - - func.func @called_hostlaunch_bad() -> () { - "tf_device.cluster"() ({ - "tf.PartitionedCall"() {f = @called_hostlaunch_bad_callee} : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - "tf_device.cluster"() ({ - "tf.PartitionedCall"() {f = @called_hostlaunch_bad_callee} : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - // expected-error@+1 {{The same function is reachable from multiple TPU Clusters.}} - func.func @called_hostlaunch_bad_callee() -> () { - // CHECK: "tf.A" - // CHECK-NOT: "tf_device.launch" - // CHECK-NEXT: "tf.B" - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" - "tf.A"() : () -> () - "tf_device.launch"() ({ - "tf.B"() : () -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () - "tf.C"() : () -> () - func.return - } -} - -// ----- - -// Checks that transform to outside compiled occurs when there is model -// parallelism. 
- -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { - // CHECK-LABEL: func @model_parallelism - func.func @model_parallelism() -> () { - // CHECK: "tf.A" - // CHECK-NOT: "tf_device.launch" - // CHECK-NEXT: "tf.B" - // CHECK-SAME: _xla_outside_compilation - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ({ - "tf.A"() : () -> () - "tf_device.launch"() ({ - "tf.B"() : () -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {num_cores_per_replica = 2, topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1]} : () -> () - func.return - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir deleted file mode 100644 index c0230b43d1db04..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir +++ /dev/null @@ -1,194 +0,0 @@ -// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-outside-compiled-to-host-launch | FILECHECK_OPTS="" FileCheck %s - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - - // Tests that TPU cluster with no outside compilation does not generate launch op. - - // CHECK-LABEL: func @no_outside_compilation - // CHECK-NOT: "tf_device.launch" - func.func @no_outside_compilation() -> tensor { - %0 = "tf_device.cluster"() ({ - %1 = "tf.A"() : () -> tensor - %2 = "tf.B"(%1) : (tensor) -> tensor - tf_device.return %2 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - func.return %0 : tensor - } - - - // Tests the launch wrap of a single outside compiled cluster with no input or output dependencies. - - // CHECK-LABEL: func @nodep_single_outside_compilation - func.func @nodep_single_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK: "tf_device.launch" - // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0" - // CHECK-NEXT: "tf.B" - // CHECK-NOT: _xla_outside_compilation - // CHECK-NEXT: tf_device.return - // CHECK: device_assignment = [], num_cores_per_replica = 1 : i64, topology = "" - "tf_device.cluster"() ({ - "tf.A"() : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - - // Tests the launch wrap of a single outside compiled cluster with data parallelism. 
- - // CHECK-LABEL: func @single_outside_compilation_with_replicate - func.func @single_outside_compilation_with_replicate(%arg0: tensor) -> () { - // CHECK: "tf.A" - // CHECK: tf_device.replicate - // CHECK-NEXT: "tf_device.cluster" - // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf_device.launch" - // CHECK-SAME: device = "TPU_REPLICATED_HOST_0" - // CHECK-NEXT: "tf.C" - // CHECK-NOT: _xla_outside_compilation - // CHECK: tf_device.return - // CHECK: device_assignment = [], num_cores_per_replica = 1 : i64, topology = "" - %0 = "tf.A"(%arg0) : (tensor) -> tensor - tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - "tf_device.cluster"() ({ - "tf.B"() : () -> () - "tf.C"(%ri_0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () - "tf.D"() : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - tf_device.return - } - func.return - } - - // Tests launch wrap of a single outside compiled cluster with input/output. - - // CHECK-LABEL: func @single_outside_compilation_input_output - func.func @single_outside_compilation_input_output(%arg0: tensor) -> tensor { - %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: "tf_device.cluster" - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK-NEXT: %[[LAUNCH_OUTPUT:[0-9]*]] = "tf_device.launch" - // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) - // CHECK: tf_device.return %[[B_OUTPUT]] - // CHECK: "tf.C"(%[[LAUNCH_OUTPUT]]) - %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - %2 = "tf_device.cluster"() ({ - %3 = "tf.A"() : () -> (tensor) - %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %5 = "tf.C"(%4) : (tensor) -> tensor - tf_device.return %5 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - tf_device.return %2 : tensor - } - - func.return %1 : tensor - } - - // Tests launch wrap of multiple outside compiled cluster with input/output. 
- - // CHECK-LABEL: func @multiple_outside_compilation_input_output - func.func @multiple_outside_compilation_input_output(%arg0: tensor) -> tensor { - %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: "tf_device.cluster" - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK-NEXT: %[[LAUNCH_OUTPUT:[0-9]*]] = "tf_device.launch" - // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) - // CHECK: tf_device.return %[[B_OUTPUT]] - // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[LAUNCH_OUTPUT]]) - // CHECK-NEXT: %[[LAUNCH_OUTPUT2:[0-9]*]] = "tf_device.launch" - // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[C_OUTPUT]]) - // CHECK: tf_device.return %[[D_OUTPUT]] - // CHECK: %[[LAUNCH_OUTPUT3:[0-9]*]] = "tf_device.launch" - // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[LAUNCH_OUTPUT2]]) - // CHECK: tf_device.return %[[E_OUTPUT]] - // CHECK: "tf.F"(%[[LAUNCH_OUTPUT3]]) - %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - %2 = "tf_device.cluster"() ({ - %3 = "tf.A"() : () -> (tensor) - %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %5 = "tf.C"(%4) : (tensor) -> tensor - %6 = "tf.D"(%5) {_xla_outside_compilation = "cluster2"} : (tensor) -> tensor - %7 = "tf.E"(%6) {_xla_outside_compilation = "cluster2"} : (tensor) -> tensor - %8 = "tf.F"(%7) : (tensor) -> tensor - tf_device.return %8 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - tf_device.return %2 : tensor - } - - func.return %1 : tensor - } - - // Tests the launch wrap of an outside compiled op that's called from a tf_device.cluster. - - func.func @called_outside_compilation() -> () { - "tf_device.cluster"() ({ - "tf.PartitionedCall"() {f = @called_outside_compilation_callee} : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - // CHECK-LABEL: func @called_outside_compilation_callee - func.func @called_outside_compilation_callee() -> () { - // CHECK: "tf.A" - // CHECK: "tf_device.launch" - // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0" - // CHECK-NEXT: "tf.B" - // CHECK-NOT: _xla_outside_compilation - // CHECK-NEXT: tf_device.return - "tf.A"() : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () - func.return - } - - // Test that the same outside compiled function cannot be called from two - // different TPU clusters. - - func.func @called_outside_compilation_bad() -> () { - "tf_device.cluster"() ({ - "tf.PartitionedCall"() {f = @called_outside_compilation_bad_callee} : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - "tf_device.cluster"() ({ - "tf.PartitionedCall"() {f = @called_outside_compilation_bad_callee} : () -> () - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () - func.return - } - // expected-error@+1 {{The same function is reachable from multiple TPU Clusters.}} - func.func @called_outside_compilation_bad_callee() -> () { - "tf.A"() : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () - func.return - } -} - -// ----- - -// Tests that model parallelism does not affect outside compilation. 
- -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { - // CHECK-LABEL: func @outside_compilation_model_parallelism - func.func @outside_compilation_model_parallelism() -> () { - // CHECK: "tf.A" - // CHECK: "tf_device.launch" - // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0" - // CHECK-NEXT: "tf.B" - // CHECK-NOT: _xla_outside_compilation - // CHECK-NEXT: tf_device.return - // CHECK: num_cores_per_replica = 2 : i64 - %0 = "tf_device.cluster"() ({ - "tf.A"() : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () - tf_device.return - }) {num_cores_per_replica = 2, topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1]} : () -> tensor<2xi32> - func.return - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index c57e07b5e3f74e..7246cdb4513280 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -2964,3 +2964,83 @@ func.func @global_iter_id_effect() -> () { // expected-remark@above {{ID: 6}} // expected-remark@above {{Sinks: {}}} } + +// ----- + +func.func @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + // expected-remark@above {{ID: 2}} + %sum = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // expected-remark@above {{ID: 0}} + func.return %sum : tensor<1xf32> + // expected-remark@above {{ID: 1}} + // expected-remark@above {{Sinks: {}}} +} + +func.func @intermediary(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + // expected-remark@above {{ID: 2}} + %result = "tf.StatefulPartitionedCall"(%arg0, %arg1) {config="", config_proto="", executor_type="", f=@add} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // expected-remark@above {{ID: 0}} + func.return %result : tensor<1xf32> + // expected-remark@above {{ID: 1}} + // expected-remark@above {{Sinks: {}}} +} + +// CHECK-LABEL: func @call_pure_function +func.func @call_pure_function(%arg0: tensor) -> tensor { + // expected-remark@above {{ID: 5}} + %one = "tf.Const"() { value = dense<1.0> : tensor<1xf32> } : () -> tensor<1xf32> + // expected-remark@above {{ID: 0}} + %r1 = "tf.ReadVariableOp"(%arg0) : (tensor) -> tensor<1xf32> + // expected-remark@above {{ID: 1}} + %two = "tf.StatefulPartitionedCall"(%one, %one) {config="", config_proto="", executor_type="", f=@intermediary} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // expected-remark@above {{ID: 2}} + %r2 = "tf.ReadVariableOp"(%arg0) : (tensor) -> tensor<1xf32> + // expected-remark@above {{ID: 3}} + func.return %arg0 : tensor + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Sinks: {1,3}}} +} + +// ----- + +func.func @assert(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor { + // expected-remark@above {{ID: 3}} + %cond = builtin.unrealized_conversion_cast to tensor + // expected-remark@above {{ID: 0}} + "tf.Assert"(%cond, %arg1) {device = "/job:localhost/replica:0/task:0/device:CPU:0", summarize = 3 : i64} : (tensor, tensor<1xf32>) -> () + // expected-remark@above {{ID: 1}} + func.return %cond : tensor + // expected-remark@above {{ID: 2}} + // 
expected-remark@above {{Sinks: {1}}} +} + +func.func @intermediary(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + // expected-remark@above {{ID: 3}} + %cond = builtin.unrealized_conversion_cast to tensor + // expected-remark@above {{ID: 0}} + %sum = "tf.If"(%cond, %arg0, %arg1) { + then_branch = @assert, + else_branch = @assert, + is_stateless = false + } : (tensor, tensor<1xf32>, tensor<1xf32>) -> tensor + // expected-remark@-5 {{ID: 1}} + func.return %arg0 : tensor<1xf32> + // expected-remark@above {{ID: 2}} + // expected-remark@above {{Sinks: {1}}} +} + +// CHECK-LABEL: func @assert_within_if +func.func @assert_within_if(%arg0: tensor) -> tensor { + // expected-remark@above {{ID: 5}} + %one = "tf.Const"() { value = dense<1.0> : tensor<1xf32> } : () -> tensor<1xf32> + // expected-remark@above {{ID: 0}} + %r1 = "tf.ReadVariableOp"(%arg0) : (tensor) -> tensor<1xf32> + // expected-remark@above {{ID: 1}} + %result = "tf.StatefulPartitionedCall"(%one, %one) {config="", config_proto="", executor_type="", f=@intermediary} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // expected-remark@above {{ID: 2}} + %r2 = "tf.ReadVariableOp"(%arg0) : (tensor) -> tensor<1xf32> + // expected-remark@above {{ID: 3}} + func.return %arg0 : tensor + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Sinks: {1,3}}} +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir index 4333e79e0ee430..a7423b729dd287 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir @@ -32,12 +32,12 @@ func.func @batchMatMulTwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6xf // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = 
false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -76,12 +76,12 @@ func.func @batchMatMulTwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x3x // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], 
%[[RHS_6]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -109,9 +109,9 @@ func.func @batchMatMulOneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -131,7 +131,7 @@ func.func @batchMatMulSingleBatch(%arg0: tensor<1x4x5xf32>, %arg1: tensor<1x5x6x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%arg1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) <{axis = 0 : i64}> : (tensor<4x6xf32>) -> tensor<1x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<1x4x6xf32> @@ -152,9 +152,9 @@ func.func @batchMatMulUnbatchedLeft(%arg0: tensor<4x5xf32>, %arg1: tensor<3x5x6x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : 
(tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -175,9 +175,9 @@ func.func @batchMatMulUnbatchedRight(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6 // CHECK: %[[LHS_2:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#1, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_3:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#2, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -190,7 +190,7 @@ func.func @batchMatMulMatrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) -> func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulMatrix - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -201,7 +201,7 @@ func.func @batchMatMulMatrixAdjXY(%arg0: tensor<5x4xf32>, %arg1: 
tensor<6x5xf32> func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulMatrixAdjXY - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -238,12 +238,12 @@ func.func @batchMatMulV2TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6 // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -282,12 +282,12 @@ func.func @batchMatMulV2TwoDimAdjXY(%arg0: 
tensor<2x3x5x4xf32>, %arg1: tensor<2x // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -319,12 +319,12 @@ func.func @batchMatMulV2Broadcast(%arg0: tensor<2x1x4x5xf32>, %arg1: tensor<1x3x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, 
transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -352,9 +352,9 @@ func.func @batchMatMulV2OneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32 // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, 
transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -374,7 +374,7 @@ func.func @batchMatMulV2SingleBatch(%arg0: tensor<1x4x5xf32>, %arg1: tensor<1x5x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%arg1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) <{axis = 0 : i64}> : (tensor<4x6xf32>) -> tensor<1x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<1x4x6xf32> @@ -395,9 +395,9 @@ func.func @batchMatMulV2UnbatchedLeft(%arg0: tensor<4x5xf32>, %arg1: tensor<3x5x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -418,9 +418,9 @@ func.func @batchMatMulV2UnbatchedRight(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5 // CHECK: %[[LHS_2:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#1, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_3:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#2, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: 
%[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -433,7 +433,7 @@ func.func @batchMatMulV2Matrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) - func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV2Matrix - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -444,7 +444,7 @@ func.func @batchMatMulV2MatrixAdjXY(%arg0: tensor<5x4xf32>, %arg1: tensor<6x5xf3 func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV2MatrixAdjXY - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -455,7 +455,7 @@ func.func @batchMatMulV2DynamicSize(%arg0: tensor, %arg1: tensor // CHECK-LABEL: batchMatMulV2DynamicSize - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor, tensor) -> tensor + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor, tensor) -> tensor // CHECK: return %[[MATMUL_1]] : tensor } @@ -492,12 +492,12 @@ func.func @batchMatMulV3TwoDim(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<2x3x5x6 // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = 
false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -536,12 +536,12 @@ func.func @batchMatMulV3TwoDimAdjXY(%arg0: tensor<2x3x5x4xf32>, %arg1: tensor<2x // CHECK: %[[RHS_5:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#4, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> // CHECK: %[[RHS_6:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#5, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x6x5xf32>, tensor<2xi64>) -> tensor<6x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> 
tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_4]], %[[RHS_4]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_5]], %[[RHS_5]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_6]], %[[RHS_6]]) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -573,12 +573,12 @@ func.func @batchMatMulV3Broadcast(%arg0: tensor<2x1x4x5xf32>, %arg1: tensor<1x3x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_SPLIT]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : 
(tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_4:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_5:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_6:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]], %[[MATMUL_4]], %[[MATMUL_5]], %[[MATMUL_6]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<6x4x6xf32> // CHECK: %[[RESULT:.*]] = "tf.Reshape"(%[[MATMUL_PACKED]], %[[RESULT_SHAPE]]) : (tensor<6x4x6xf32>, tensor<4xi64>) -> tensor<2x3x4x6xf32> @@ -606,9 +606,9 @@ func.func @batchMatMulV3OneDim(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32 // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -628,7 +628,7 @@ func.func @batchMatMulV3SingleBatch(%arg0: tensor<1x4x5xf32>, %arg1: tensor<1x5x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%arg1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]]) <{axis = 0 : i64}> : (tensor<4x6xf32>) -> tensor<1x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : 
tensor<1x4x6xf32> @@ -649,9 +649,9 @@ func.func @batchMatMulV3UnbatchedLeft(%arg0: tensor<4x5xf32>, %arg1: tensor<3x5x // CHECK: %[[RHS_2:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#1, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> // CHECK: %[[RHS_3:.*]] = "tf.Reshape"(%[[RHS_RESHAPED]]#2, %[[MATMUL_RHS_SHAPE]]) : (tensor<1x5x6xf32>, tensor<2xi64>) -> tensor<5x6xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %[[RHS_1]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%arg0, %[[RHS_2]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%arg0, %[[RHS_3]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -672,9 +672,9 @@ func.func @batchMatMulV3UnbatchedRight(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5 // CHECK: %[[LHS_2:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#1, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> // CHECK: %[[LHS_3:.*]] = "tf.Reshape"(%[[LHS_SPLIT]]#2, %[[MATMUL_LHS_SHAPE]]) : (tensor<1x4x5xf32>, tensor<2xi64>) -> tensor<4x5xf32> - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%[[LHS_1]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_2:.*]] = "tf.MatMul"(%[[LHS_2]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_3:.*]] = "tf.MatMul"(%[[LHS_3]], %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: %[[MATMUL_PACKED:.*]] = "tf.Pack"(%[[MATMUL_1]], %[[MATMUL_2]], %[[MATMUL_3]]) <{axis = 0 : i64}> : (tensor<4x6xf32>, tensor<4x6xf32>, tensor<4x6xf32>) -> tensor<3x4x6xf32> // CHECK: return %[[MATMUL_PACKED]] : tensor<3x4x6xf32> @@ -687,7 +687,7 @@ func.func @batchMatMulV3Matrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) - func.return %0 : tensor<4x6xf32> // CHECK-LABEL: 
batchMatMulV3Matrix - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> : (tensor<4x5xf32>, tensor<5x6xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } @@ -698,7 +698,7 @@ func.func @batchMatMulV3MatrixAdjXY(%arg0: tensor<5x4xf32>, %arg1: tensor<6x5xf3 func.return %0 : tensor<4x6xf32> // CHECK-LABEL: batchMatMulV3MatrixAdjXY - // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> + // CHECK: %[[MATMUL_1:.*]] = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = true, transpose_b = true}> : (tensor<5x4xf32>, tensor<6x5xf32>) -> tensor<4x6xf32> // CHECK: return %[[MATMUL_1]] : tensor<4x6xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD index 58c3338ad2264e..3114f0d9546a5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD @@ -477,7 +477,6 @@ cc_library( "guarantee_all_funcs_one_use.cc", "hoist_loop_invariant.cc", "hoist_replicate_invariant_resource_writes.cc", - "host_launch_to_outside_compiled.cc", "init_text_file_to_import.cc", "launch_to_device_attribute.cc", "layout_optimization.cc", @@ -490,7 +489,6 @@ cc_library( "name_anonymous_iterators.cc", "optimize.cc", "order_by_dialect.cc", - "outside_compiled_to_host_launch.cc", "parallel_execute_to_islands.cc", "prepare_tpu_computation_for_tf_export.cc", "print.cc", @@ -566,9 +564,7 @@ cc_library( ":cluster_formation", ":decompose_resource_ops", ":decompose_resource_ops_inc_gen", - ":extract_outside_compilation", ":lower_tf_lib", - ":mark_ops_for_outside_compilation", ":shape_inference_pass", ":tensorflow_optimize_inc_gen", ":tf_data_optimization", @@ -578,7 +574,6 @@ cc_library( ":tfe_legalize_tfg", ":unroll_batch_matmul_pass", ":verify_no_outside_compilation_markers_pass", - ":xla_cluster_formation", "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite:validators", @@ -617,6 +612,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", "//tensorflow/compiler/mlir/tensorflow:xla_rewrite_util", "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", + "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:tpu_metadata_utils", "//tensorflow/compiler/mlir/tf2xla/internal/inference:inference_metrics_pass", "//tensorflow/compiler/mlir/tf2xla/transforms:legalization_op_config", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", @@ -705,138 +701,24 @@ cc_library( ], ) -cc_library( - name = "xla_cluster_formation", - srcs = ["xla_cluster_formation.cc"], - textual_hdrs = [ - "tf_passes.h.inc", - ], - deps = [ - ":tf_device_pass_inc_gen", - ":tf_pass_inc_gen", - ":verify_no_outside_compilation_markers_pass", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:attribute_utils", - "//tensorflow/compiler/mlir/tensorflow:call_graph_util", - "//tensorflow/compiler/mlir/tensorflow:cluster_util", - "//tensorflow/compiler/mlir/tensorflow:string_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - 
"//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", - "//tensorflow/core:core_cpu_base", - "//tensorflow/core:framework", - "//tensorflow/core:portable_gif_internal", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - ], -) - -cc_library( - name = "extract_outside_compilation", - srcs = ["extract_outside_compilation.cc"], - textual_hdrs = [ - "tf_passes.h.inc", - ], - deps = [ - ":lower_tf_lib", - ":shape_inference_pass", - ":tf_pass_inc_gen", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:attribute_utils", - "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", - "//tensorflow/compiler/mlir/tensorflow:string_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", - "//tensorflow/compiler/mlir/tf2xla/transforms:legalization_op_config", - "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - ], -) - -cc_library( - name = "mark_ops_for_outside_compilation", - srcs = ["mark_ops_for_outside_compilation.cc"], - textual_hdrs = [ - "tf_passes.h.inc", - ], - deps = [ - ":lower_tf_lib", - ":tf_pass_inc_gen", - ":verify_no_outside_compilation_markers_pass", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:attribute_utils", - "//tensorflow/compiler/mlir/tensorflow:string_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", - "//tensorflow/compiler/mlir/tf2xla/transforms:legalization_op_config", - "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Rewrite", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TransformUtils", - ], -) - cc_library( name = "bridge", srcs = ["bridge.cc"], hdrs = ["bridge.h"], deps = [ ":tensorflow_passes", - "//tensorflow/compiler/jit:flags_headers", - "//tensorflow/compiler/mlir/tensorflow:bridge_logger", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", - "//tensorflow/compiler/mlir/tf2xla/api/v1:tf_dialect_to_executor", - "//tensorflow/compiler/mlir/tf2xla/api/v2:cluster_tf", - 
"//tensorflow/compiler/mlir/tf2xla/api/v2:device_type_proto_cc", - "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_dialect_to_executor", - "//tensorflow/compiler/mlir/tf2xla/internal:clustering_bridge_passes", "//tensorflow/compiler/mlir/tf2xla/internal:logging_hooks", - "//tensorflow/compiler/mlir/tf2xla/internal/inference:inference_metrics_pass", - "//tensorflow/core:framework", "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core/platform:error_payloads", - "//tensorflow/core/platform:stacktrace", "//tensorflow/core/platform:status", "//tensorflow/core/protobuf:for_core_protos_cc", "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/log", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Transforms", - "@local_tsl//tsl/platform:error_logging", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index a7f1037b312544..07f399e53d1a3d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -15,32 +15,16 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" -#include -#include -#include - +#include "absl/log/log.h" #include "llvm/ADT/StringRef.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/jit/flags.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.h" -#include "tensorflow/compiler/mlir/tf2xla/internal/inference/inference_passes.h" #include "tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h" -#include "tensorflow/core/framework/metrics.h" -#include "tensorflow/core/platform/error_payloads.h" -#include "tensorflow/core/platform/stacktrace.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/protobuf/core_platform_payloads.pb.h" -#include "tensorflow/core/util/debug_data_dumper.h" -#include "tsl/platform/error_logging.h" namespace mlir { namespace TF { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 4525bb7ce7db81..e403e4c6f7e960 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -87,13 +87,13 @@ def GatherToV2 : Pat< // with V1. 
def BatchMatMulToV2 : Pat< (TF_BatchMatMulOp:$src AnyStaticShapeTensor:$x, AnyStaticShapeTensor:$y, - $adj_x, $adj_y), - (TF_BatchMatMulV2Op:$dest $x, $y, $adj_x, $adj_y), + $adj_x, $adj_y, $grad_x, $grad_y), + (TF_BatchMatMulV2Op:$dest $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), [], [(CopyAttrs $src, $dest)]>; def BatchMatMulToMatMul : Pat< - (TF_BatchMatMulOp:$src $x, $y, $adj_x, $adj_y), - (TF_MatMulOp:$dest $x, $y, $adj_x, $adj_y), + (TF_BatchMatMulOp:$src $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), + (TF_MatMulOp:$dest $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), [(IsRank2Tensor $x), (IsRank2Tensor $y)], [(CopyAttrs $src, $dest)]>; @@ -102,8 +102,8 @@ def BatchMatMulToMatMul : Pat< //===----------------------------------------------------------------------===// def BatchMatMulV2ToMatMul : Pat< - (TF_BatchMatMulV2Op:$src $x, $y, $adj_x, $adj_y), - (TF_MatMulOp:$dest $x, $y, $adj_x, $adj_y), + (TF_BatchMatMulV2Op:$src $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), + (TF_MatMulOp:$dest $x, $y, $adj_x, $adj_y, $grad_x, $grad_y), [(IsRank2Tensor $x), (IsRank2Tensor $y)], [(CopyAttrs $src, $dest)]>; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc index 8f902c1eff7a0a..51afea6d84671e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc @@ -361,15 +361,14 @@ IslandOp CreateNewIsland(const MergedIsland& merged_island, // Creates respective YieldOp for the new merged island. YieldOp CreateNewIslandYieldOp(IslandOp new_island, - llvm::ArrayRef results) { + llvm::MutableArrayRef results) { llvm::SmallVector yield_operands; yield_operands.reserve(results.size()); - for (auto ret_vals : llvm::zip(results, new_island.getOutputs())) { - const auto& old_result = std::get<0>(ret_vals); - + for (auto [old_result, new_island] : + llvm::zip(results, new_island.getOutputs())) { // Replace original island result with new island result. - old_result.island_result.replaceAllUsesWith(std::get<1>(ret_vals)); + old_result.island_result.replaceAllUsesWith(new_island); // Add associated inner op result to operands of the YieldOp. yield_operands.push_back(old_result.inner_op_result); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index d88550d7920ab6..125cbbd6163c33 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -102,7 +102,9 @@ Value ConvertConditionToBoolean(Operation* op, Value cond) { return cond; OpBuilder builder(op); - return builder.create(op->getLoc(), cond); + Value to_bool = builder.create(op->getLoc(), cond); + CopyDeviceAndUnderscoredAttributes(op, to_bool.getDefiningOp()); + return to_bool; } // Transform a functional IfOp to a region based IfRegionOp. 
@@ -171,6 +173,48 @@ LogicalResult ConvertWhileOp(WhileOp while_op, bool allow_passthrough_args) { return success(); } +LogicalResult ConvertGeneratorDatasetOp(GeneratorDatasetOp generator_op) { + auto generator_region = + OpBuilder(generator_op) + .create( + generator_op.getLoc(), generator_op->getResultTypes(), + generator_op.getInitFuncOtherArgs(), + generator_op.getNextFuncOtherArgs(), + generator_op.getFinalizeFuncOtherArgs(), + generator_op.getOutputTypes(), generator_op.getOutputShapes(), + generator_op.getMetadata()); + CopyDeviceAndUnderscoredAttributes(generator_op, generator_region); + + func::FuncOp init_function = + SymbolTable::lookupNearestSymbolFrom( + generator_op, generator_op.getInitFunc()); + func::FuncOp next_function = + SymbolTable::lookupNearestSymbolFrom( + generator_op, generator_op.getNextFunc()); + func::FuncOp finalize_function = + SymbolTable::lookupNearestSymbolFrom( + generator_op, generator_op.getFinalizeFunc()); + + if (!init_function || !next_function || !finalize_function) { + return failure(); + } + + CreateCall(generator_op, init_function, generator_region.getInit(), + generator_region.getInitFuncOtherArgs(), + /*use_region_args=*/true, /*forward_block_args=*/false); + CreateCall(generator_op, next_function, generator_region.getNext(), + generator_region.getNextFuncOtherArgs(), + /*use_region_args=*/true, /*forward_block_args=*/false); + CreateCall(generator_op, finalize_function, generator_region.getFinalize(), + generator_region.getFinalizeFuncOtherArgs(), + /*use_region_args=*/true, /*forward_block_args=*/false); + + generator_op->replaceAllUsesWith(generator_region->getResults()); + generator_op->erase(); + + return success(); +} + void FunctionalControlFlowToRegions::runOnOperation() { ModuleOp module = getOperation(); auto result = module.walk([&](Operation* op) { @@ -189,6 +233,13 @@ void FunctionalControlFlowToRegions::runOnOperation() { op->emitOpError() << "failed to convert to region form"; return WalkResult::interrupt(); } + } else if (auto generator_op = llvm::dyn_cast(op)) { + if (allow_passthrough_args_) { + if (failed(ConvertGeneratorDatasetOp(generator_op))) { + op->emitOpError() << "failed to convert to region form"; + return WalkResult::interrupt(); + } + } } return WalkResult::advance(); }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_launch_to_outside_compiled.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_launch_to_outside_compiled.cc deleted file mode 100644 index 1c4383326e7625..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_launch_to_outside_compiled.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/FormatVariadic.h" -#include "mlir/Analysis/CallGraph.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" - -namespace mlir { -namespace TFDevice { - -namespace { - -constexpr char kDeviceAttr[] = "device"; -constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; - -#define GEN_PASS_DEF_HOSTLAUNCHTOOUTSIDECOMPILEDPASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" - -struct HostLaunchToOutsideCompiledPass - : public impl::HostLaunchToOutsideCompiledPassBase< - HostLaunchToOutsideCompiledPass> { - void runOnOperation() override; -}; - -// Assign all ops in region with _xla_outside_compilation attribute. -void MarkOutsideCompiledInRegion(Region& region) { - region.walk([&](Operation* op) { - op->setAttr(kXlaOutsideCompilationAttr, - StringAttr::get(op->getContext(), "from_launch")); - }); -} - -void HoistOpsAndAnnotateWithOutsideCompilation(tf_device::LaunchOp launch) { - // Forward launch inner op results to launch op results. - launch.replaceAllUsesWith(launch.GetBody().getTerminator()->getOperands()); - - // For all inner ops, assign the launch device as a `device` attribute. - MarkOutsideCompiledInRegion(launch.getBody()); - - // Move all inner ops of the launch to the block containing the launch. - auto body = launch.GetBody().without_terminator(); - Operation* launch_op = launch.getOperation(); - launch_op->getBlock()->getOperations().splice( - launch_op->getIterator(), launch.GetBody().getOperations(), body.begin(), - body.end()); - - launch.erase(); -} - -void HostLaunchToOutsideCompiledPass::runOnOperation() { - auto traverse_op = [&](Operation* op, tf_device::ClusterOp tpu_cluster, - std::optional host_device) { - // Hoist launch. 
- if (tf_device::LaunchOp launch = dyn_cast(op)) { - StringAttr device_attr = launch->getAttrOfType(kDeviceAttr); - if (host_device && device_attr && - device_attr.getValue().equals(*host_device)) - HoistOpsAndAnnotateWithOutsideCompilation(launch); - } - return WalkResult::advance(); - }; - - ModuleOp module = getOperation(); - if (failed(TFTPU::WalkReachableFromTpuCluster(module, traverse_op))) - return signalPassFailure(); -} - -} // anonymous namespace - -std::unique_ptr> -CreateHostLaunchToOutsideCompiledPass() { - return std::make_unique(); -} - -} // namespace TFDevice -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD index 359dd5c4624712..aa5097f19dbd5d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD @@ -75,3 +75,60 @@ tf_cc_test( "@llvm-project//mlir:Pass", ], ) + +cc_library( + name = "tpu_metadata_utils", + srcs = [ + "tpu_metadata_utils.cc", + ], + hdrs = [ + "tpu_metadata_utils.h", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla:xla_proto_cc", + ], +) + +tf_cc_test( + name = "tpu_metadata_utils_test", + srcs = ["tpu_metadata_utils_test.cc"], + data = [ + "testdata/basic_cluster.mlir", + "testdata/spmd.mlir", + ], + deps = [ + ":tpu_metadata_utils", + "//tensorflow/compiler/mlir:register_common_dialects", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc index ab9a56e3b6db8e..3e3e8db504f1da 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc @@ -50,7 +50,6 @@ namespace { using mlir::DialectRegistry; using mlir::MLIRContext; using mlir::ModuleOp; -using mlir::OpPassManager; using mlir::OwningOpRef; using mlir::func::FuncOp; using ::tensorflow::monitoring::testing::CellReader; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/testdata/spmd.mlir 
b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/testdata/spmd.mlir new file mode 100644 index 00000000000000..21e27e013832f3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/testdata/spmd.mlir @@ -0,0 +1,9 @@ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU:2", "/job:localhost/replica:0/task:0/device:TPU:3", "/job:localhost/replica:0/task:0/device:TPU:4", "/job:localhost/replica:0/task:0/device:TPU:5", "/job:localhost/replica:0/task:0/device:TPU:6", "/job:localhost/replica:0/task:0/device:TPU:7"]} { + func.func @main(%arg0: tensor<*xf32> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) { + "tf_device.cluster_func"(%arg0) <{func = @empty_func}> {_dynamic_arg_index = [], _replication_info = "cluster", _xla_compile_device_type = "TPU", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], input_sharding_configuration = ["{devices=[2,1]0,1}"], num_cores_per_replica = 2 : i64, output_sharding_configuration = [""], padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", tpu_compile_options_proto = "", use_spmd_for_xla_partitioning = true, use_tpu = true} : (tensor<*xf32>) -> (tensor<*xf32>) + func.return + } + func.func @empty_func(%arg0: tensor<*xf32>) -> tensor<*xf32> { + func.return %arg0 : tensor<*xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc new file mode 100644 index 00000000000000..767d5cf7f0cf8c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc @@ -0,0 +1,250 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h" + +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" +#include "xla/xla.pb.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace mlir { +namespace TFTPU { +namespace { +constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; +constexpr char kUseXlaSpmdAttr[] = "use_spmd_for_xla_partitioning"; + +constexpr char kBadStringArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not a string"; +constexpr char kBadArrayElementMsg[] = + "bad '{0}' attribute at index {1} with value '{2}': failed to parse to {3}"; +constexpr char kBadArrayAttrLengthMsg[] = + "bad '{0}' attribute, expected array attribute of size {1}, got size {2}"; + +// Creates a missing attribute error message. +std::string CreateMissingAttributeMsg(llvm::StringRef attribute) { + return llvm::formatv("requires attribute '{0}'", attribute).str(); +} + +// Populates a TPUCompileMetadataProto with StepMarkerLocation from a +// `tf_device::ClusterFuncOp`. +LogicalResult SetMetadataProtoStepMarkerLocation( + tf_device::ClusterFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + auto step_marker_location = + op->getAttrOfType(kStepMarkerLocationAttr); + if (!step_marker_location) + return op.emitOpError(CreateMissingAttributeMsg(kStepMarkerLocationAttr)); + + // Default to `STEP_MARK_AT_ENTRY` for step marker location if attribute is + // empty. + xla::DebugOptions::StepMarkerLocation location = + xla::DebugOptions::STEP_MARK_AT_ENTRY; + if (!step_marker_location.getValue().empty() && + !xla::DebugOptions::StepMarkerLocation_Parse( + std::string(step_marker_location.getValue()), &location)) + return op.emitOpError(llvm::formatv("bad '{0}' attribute with value '{1}'", + kStepMarkerLocationAttr, + step_marker_location.getValue())); + + metadata->set_step_marker_location(location); + + return success(); +} + +// Parses a xla::OpSharding from a string attribute. 
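A minimal sketch of the attribute-to-enum mapping that SetMetadataProtoStepMarkerLocation above relies on, using only the protobuf-generated parser; the function name ParseStepMarkerOrDefault is illustrative and not part of this change:

```cpp
#include <string>

#include "xla/xla.pb.h"

// Maps the `step_marker_location` string attribute to the proto enum.
// An empty string keeps the default, STEP_MARK_AT_ENTRY; an unparseable
// string is reported as an op error by the real caller (here we just fall
// back to the default).
xla::DebugOptions::StepMarkerLocation ParseStepMarkerOrDefault(
    const std::string& attr_value) {
  xla::DebugOptions::StepMarkerLocation location =
      xla::DebugOptions::STEP_MARK_AT_ENTRY;
  if (!attr_value.empty() &&
      !xla::DebugOptions::StepMarkerLocation_Parse(attr_value, &location)) {
    return xla::DebugOptions::STEP_MARK_AT_ENTRY;
  }
  return location;
}
```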
+LogicalResult SetOpSharding(Operation* op, Attribute attr, llvm::StringRef name, + int index, xla::OpSharding* sharding_ptr) { + auto sharding_attr = attr.dyn_cast(); + if (!sharding_attr) + return op->emitOpError( + llvm::formatv(kBadStringArrayElementMsg, name, index)); + if (tensorflow::DecodeShardingAttribute(sharding_attr, *sharding_ptr) + .failed()) { + return op->emitOpError(llvm::formatv(kBadArrayElementMsg, name, index, + sharding_attr.getValue(), + "xla::OpSharding")); + } + return success(); +} + +// Populates a TPUCompileMetadataProto with argument types and sharding from a +// `tf_device::ClusterFuncOp`. +LogicalResult SetMetadataProtoArgs( + tf_device::ClusterFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + auto input_shardings = + op->getAttrOfType(tensorflow::kInputShardingAttr); + if (!input_shardings) + return op.emitOpError( + CreateMissingAttributeMsg(tensorflow::kInputShardingAttr)); + + if (input_shardings.size() != op.getNumOperands()) + return op.emitOpError( + llvm::formatv(kBadArrayAttrLengthMsg, tensorflow::kInputShardingAttr, + op.getNumOperands(), input_shardings.size())); + + // Set args metadata in proto. + mlir::StringAttr replication_attr_name = mlir::StringAttr::get( + op.getContext(), "mhlo.is_same_data_across_replicas"); + + auto dynamic_arg_idx = op->getAttrOfType(TF::kDynamicArgIndexAttr); + llvm::SmallSet dynamic_arg_idx_set; + if (dynamic_arg_idx) { + for (auto idx : dynamic_arg_idx.getValue()) { + dynamic_arg_idx_set.insert(idx.dyn_cast().getInt()); + } + } + + for (auto operand_type_and_idx : llvm::enumerate(op.getOperandTypes())) { + Type operand_type = operand_type_and_idx.value(); + int index = operand_type_and_idx.index(); + tensorflow::tpu::TPUCompileMetadataProto::Arg* arg = metadata->add_args(); + tensorflow::DataType dtype; + tensorflow::Status status = + tensorflow::ConvertToDataType(operand_type, &dtype); + if (!status.ok()) + return op.emitOpError( + llvm::formatv("failed to determine operand type at index {0}: {1}", + index, status.message())); + + arg->set_dtype(dtype); + // TODO(lyandy): Support other arg kinds. + if (dtype == tensorflow::DT_RESOURCE) + arg->set_kind(tensorflow::tpu::TPUCompileMetadataProto::Arg::VARIABLE); + else + arg->set_kind(tensorflow::tpu::TPUCompileMetadataProto::Arg::PARAMETER); + + // Populate argument shapes. + *arg->mutable_shape() = tensorflow::TensorShapeProto(); + if (auto ranked_tensor_type = operand_type.dyn_cast()) { + tensorflow::TensorShapeProto shape_proto; + ConvertToTensorShapeProto(ranked_tensor_type.getShape(), &shape_proto); + *arg->mutable_shape() = std::move(shape_proto); + } else { + arg->mutable_shape()->set_unknown_rank(true); + } + + if (failed(SetOpSharding(op, input_shardings.getValue()[index], + tensorflow::kInputShardingAttr, index, + arg->mutable_sharding()))) + return failure(); + + // Populate set_is_same_data_across_replicas + // Note: this information is duplicated and can be removed from the proto + // and here once MLIR bridge phase 2 doesn't fallback to the old bridge. + auto attr = op.getFuncOp().getArgAttrOfType( + index, replication_attr_name); + arg->set_is_same_data_across_replicas(attr != nullptr && attr.getValue()); + + // Currently only support first dimension to be bounded dynamic. + arg->mutable_is_bounded_dynamic_dim()->Add( + dynamic_arg_idx_set.contains(index)); + } + + return success(); +} + +// Populates a TPUCompileMetadataProto with result sharding from a +// `tf_device::ClusterFuncOp`. 
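For concreteness, a sketch of the Arg entry that SetMetadataProtoArgs above is expected to produce for the single tensor<*xf32> operand of testdata/spmd.mlir, whose input sharding is "{devices=[2,1]0,1}". This mirrors the expectation in tpu_metadata_utils_test.cc further below; ExpectedSpmdArg is an illustrative helper name only:

```cpp
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h"
#include "xla/xla_data.pb.h"

tensorflow::tpu::TPUCompileMetadataProto::Arg ExpectedSpmdArg() {
  tensorflow::tpu::TPUCompileMetadataProto::Arg arg;
  arg.set_dtype(tensorflow::DT_FLOAT);          // from ConvertToDataType
  arg.mutable_shape()->set_unknown_rank(true);  // operand type is unranked
  arg.set_kind(tensorflow::tpu::TPUCompileMetadataProto::Arg::PARAMETER);
  // "{devices=[2,1]0,1}" decodes to a 2x1 tile assignment over cores 0 and 1.
  xla::OpSharding* sharding = arg.mutable_sharding();
  sharding->set_type(xla::OpSharding::OTHER);
  sharding->add_tile_assignment_dimensions(2);
  sharding->add_tile_assignment_dimensions(1);
  sharding->add_tile_assignment_devices(0);
  sharding->add_tile_assignment_devices(1);
  // No entry in _dynamic_arg_index, so the first dimension is not marked as
  // bounded dynamic.
  arg.add_is_bounded_dynamic_dim(false);
  return arg;
}
```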
+LogicalResult SetMetadataProtoRetvals( + tf_device::ClusterFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + auto output_shardings = + op->getAttrOfType(tensorflow::kOutputShardingAttr); + if (!output_shardings) + return op.emitOpError( + CreateMissingAttributeMsg(tensorflow::kOutputShardingAttr)); + + if (output_shardings.size() != op.getNumResults()) + return op.emitOpError( + llvm::formatv(kBadArrayAttrLengthMsg, tensorflow::kOutputShardingAttr, + op.getNumResults(), output_shardings.size())); + + // Set retvals metadata in proto. + for (auto output_sharding_and_idx : llvm::enumerate(output_shardings)) + if (failed(SetOpSharding(op, output_sharding_and_idx.value(), + tensorflow::kOutputShardingAttr, + output_sharding_and_idx.index(), + metadata->add_retvals()->mutable_sharding()))) + return failure(); + + return success(); +} + +} // namespace + +// Populates a TPUCompileMetadataProto from attributes of a +// `tf_device::ClusterFuncOp`. If any necessary attributes are missing from the +// op, a failure will be returned. +// TODO(lyandy): Support session handle and guaranteed consts. +LogicalResult SetMetadataProtoFromClusterFuncOp( + tf_device::ClusterFuncOp op, int num_replicas, int num_cores_per_replica, + std::optional&& xla_device_assignment, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + if (auto options_attr = + op->getAttrOfType("tpu_compile_options_proto")) { + if (!metadata->mutable_compile_options()->ParseFromArray( + options_attr.data(), options_attr.size())) { + return failure(); + } + } + metadata->set_num_replicas(num_replicas); + metadata->set_num_cores_per_replica(num_cores_per_replica); + + if (failed(SetMetadataProtoStepMarkerLocation(op, metadata))) + return failure(); + + if (xla_device_assignment.has_value()) + *metadata->mutable_device_assignment() = + std::move(xla_device_assignment.value()); + auto use_spmd_attr = op->getAttrOfType(kUseXlaSpmdAttr); + if (!use_spmd_attr) + return op.emitOpError(CreateMissingAttributeMsg(kUseXlaSpmdAttr)); + metadata->set_use_spmd_for_xla_partitioning(use_spmd_attr.getValue()); + + if (failed(SetMetadataProtoArgs(op, metadata))) return failure(); + + return SetMetadataProtoRetvals(op, metadata); +} + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h new file mode 100644 index 00000000000000..b58401eb6897d4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
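The simplest call pattern for the utility defined above, mirroring how tpu_metadata_utils_test.cc drives it; FillMetadata is an illustrative wrapper name, and the cluster_func_op is assumed to have been found by walking a parsed module:

```cpp
#include <optional>

#include "mlir/Support/LogicalResult.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h"
#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h"

mlir::LogicalResult FillMetadata(
    mlir::tf_device::ClusterFuncOp cluster_func_op,
    tensorflow::tpu::TPUCompileMetadataProto* metadata) {
  // No explicit device assignment, so pass an empty optional.
  return mlir::TFTPU::SetMetadataProtoFromClusterFuncOp(
      cluster_func_op, /*num_replicas=*/1, /*num_cores_per_replica=*/1,
      /*xla_device_assignment=*/{}, metadata);
}
```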
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_TPU_METADATA_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_TPU_METADATA_UTILS_H_ + +#include + +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "xla/xla.pb.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" + +namespace mlir { +namespace TFTPU { + +// Populates a TPUCompileMetadataProto from attributes of a +// `tf_device::ClusterFuncOp`. If any necessary attributes are missing from the +// op, a failure will be returned. +// TODO(lyandy): Support session handle and guaranteed consts. +LogicalResult SetMetadataProtoFromClusterFuncOp( + tf_device::ClusterFuncOp op, int num_replicas, int num_cores_per_replica, + std::optional&& xla_device_assignment, + tensorflow::tpu::TPUCompileMetadataProto* metadata); +} // namespace TFTPU +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_HOST_RUNTIME_TPU_METADATA_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils_test.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils_test.cc new file mode 100644 index 00000000000000..50fd035ebb8153 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils_test.cc @@ -0,0 +1,182 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h" + +#include +#include +#include + +#include +#include +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/register_common_dialects.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/core/platform/resource_loader.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/protobuf.h" +#include "tsl/platform/statusor.h" + +namespace mlir { +namespace TFTPU { +namespace { + +using mlir::DialectRegistry; +using mlir::MLIRContext; +using mlir::ModuleOp; +using mlir::OwningOpRef; + +// TODO(b/229726259): Make EqualsProto available in OSS +class ProtoStringMatcher { + public: + explicit ProtoStringMatcher(const tsl::protobuf::Message& expected) + : expected_(expected.SerializeAsString()) {} + + template + bool MatchAndExplain(const Message& p, testing::MatchResultListener*) const { + return p.SerializeAsString() == expected_; + } + + void DescribeTo(::std::ostream* os) const { *os << expected_; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "not equal to expected message: " << expected_; + } + + private: + const std::string expected_; +}; + +inline ::testing::PolymorphicMatcher EqualsProto( + const tsl::protobuf::Message& x) { + return ::testing::MakePolymorphicMatcher(ProtoStringMatcher(x)); +} + +std::string TestDataPath() { + return tensorflow::GetDataDependencyFilepath( + "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/testdata/"); +} + +class TpuMetadataUtilsTest : public ::testing::Test { + public: + TpuMetadataUtilsTest() { + mlir::RegisterCommonToolingDialects(registry_); + context_.appendDialectRegistry(registry_); + context_.loadAllAvailableDialects(); + } + + absl::StatusOr> GetClusterFuncOps( + absl::string_view mlir_module_filename) { + TF_RETURN_IF_ERROR(CreateMlirModule(mlir_module_filename)); + std::vector cluster_func_ops; + + mlir_module_->walk([&](mlir::tf_device::ClusterFuncOp op) { + cluster_func_ops.push_back(op); + }); + return cluster_func_ops; + } + + private: + absl::Status CreateMlirModule(absl::string_view mlir_module_filename) { + std::string mlir_module_path = + absl::StrCat(TestDataPath(), mlir_module_filename); + mlir_module_ = + mlir::parseSourceFile(mlir_module_path, &context_); + if (!mlir_module_) { + return absl::Status( + absl::StatusCode::kNotFound, + absl::StrCat("Could not find MLIR module at ", mlir_module_path)); + } + return absl::OkStatus(); + } + + DialectRegistry registry_; + MLIRContext context_; + OwningOpRef mlir_module_; +}; + +TEST_F(TpuMetadataUtilsTest, SingleDevice) { + TF_ASSERT_OK_AND_ASSIGN(auto cluster_func_ops, + GetClusterFuncOps("basic_cluster.mlir")); + mlir::tf_device::ClusterFuncOp cluster_func_op = 
cluster_func_ops.front(); + + tensorflow::tpu::TPUCompileMetadataProto compile_metadata; + + ASSERT_TRUE(mlir::succeeded(SetMetadataProtoFromClusterFuncOp( + cluster_func_op, + /*num_replicas=*/1, /*num_cores_per_replica=*/1, {}, &compile_metadata))); + + tensorflow::tpu::TPUCompileMetadataProto expected_compile_metadata; + ASSERT_TRUE(tsl::protobuf::TextFormat::ParseFromString( + R"pb( + num_replicas: 1 num_cores_per_replica: 1 + )pb", + &expected_compile_metadata)); + + EXPECT_THAT(compile_metadata, EqualsProto(expected_compile_metadata)); +} + +TEST_F(TpuMetadataUtilsTest, spmd) { + TF_ASSERT_OK_AND_ASSIGN(auto cluster_func_ops, + GetClusterFuncOps("spmd.mlir")); + mlir::tf_device::ClusterFuncOp cluster_func_op = cluster_func_ops.front(); + + tensorflow::tpu::TPUCompileMetadataProto compile_metadata; + + ASSERT_TRUE(mlir::succeeded(SetMetadataProtoFromClusterFuncOp( + cluster_func_op, + /*num_replicas=*/1, /*num_cores_per_replica=*/2, {}, &compile_metadata))); + + tensorflow::tpu::TPUCompileMetadataProto expected_compile_metadata; + ASSERT_TRUE(tsl::protobuf::TextFormat::ParseFromString( + R"pb( + args { + dtype: DT_FLOAT + shape { unknown_rank: true } + kind: PARAMETER + sharding { + type: OTHER + tile_assignment_dimensions: 2 + tile_assignment_dimensions: 1 + tile_assignment_devices: 0 + tile_assignment_devices: 1 + } + is_bounded_dynamic_dim: false + } + retvals { sharding {} } + num_replicas: 1 + num_cores_per_replica: 2 + use_spmd_for_xla_partitioning: true + compile_options {} + )pb", + &expected_compile_metadata)); + + EXPECT_THAT(compile_metadata, EqualsProto(expected_compile_metadata)); +} + +} // namespace +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/outside_compiled_to_host_launch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/outside_compiled_to_host_launch.cc deleted file mode 100644 index e710e76b03a3c5..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/outside_compiled_to_host_launch.cc +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
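The ProtoStringMatcher above compares serialized bytes, which the TODO notes is a stopgap until EqualsProto is available in OSS. If byte-for-byte serialization ever proves too brittle, protobuf's MessageDifferencer is one possible alternative; this is a sketch of that option, not something the test currently uses:

```cpp
#include "google/protobuf/util/message_differencer.h"
#include "tsl/platform/protobuf.h"

// Structural comparison instead of comparing serialized strings.
bool ProtosEquivalent(const tsl::protobuf::Message& a,
                      const tsl::protobuf::Message& b) {
  return tsl::protobuf::util::MessageDifferencer::Equals(a, b);
}
```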
-==============================================================================*/ - -#include "llvm/ADT/SmallVector.h" -#include "mlir/Analysis/CallGraph.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" - -namespace mlir { -namespace TFDevice { - -namespace { - -constexpr char kDeviceAttr[] = "device"; -constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; - -#define GEN_PASS_DEF_OUTSIDECOMPILEDTOHOSTLAUNCHPASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" - -struct OutsideCompiledToHostLaunchPass - : public impl::OutsideCompiledToHostLaunchPassBase< - OutsideCompiledToHostLaunchPass> { - void runOnOperation() override; -}; - -void WrapOpInLaunch(Operation* host_op, llvm::StringRef host_device) { - OpBuilder builder(host_op); - - auto launch_op = builder.create( - host_op->getLoc(), builder.getStringAttr(host_device), - /*result_types=*/host_op->getResultTypes()); - host_op->replaceAllUsesWith(launch_op); - - launch_op.getBody().push_back(new Block); - builder.setInsertionPointToEnd(&launch_op.GetBody()); - auto* return_op = - builder - .create(host_op->getLoc(), host_op->getResults()) - .getOperation(); - MLIRContext* context = launch_op.getContext(); - host_op->removeAttr(StringAttr::get(context, kXlaOutsideCompilationAttr)); - host_op->removeAttr(StringAttr::get(context, kDeviceAttr)); - host_op->moveBefore(return_op); -} - -void OutsideCompiledToHostLaunchPass::runOnOperation() { - // traverse_op is applied to each op reachable from each tf_device::ClusterOp - // in the module returned by getOperation(). - auto traverse_op = [&](Operation* op, tf_device::ClusterOp tpu_cluster, - std::optional host_device) { - // Apply WrapOpInLaunch when the op has _xla_outside_compilation. - if (op->hasAttrOfType(kXlaOutsideCompilationAttr)) { - if (!host_device) { - tpu_cluster.emitOpError( - "outside compilation is not supported with model parallelism."); - return WalkResult::interrupt(); - } - WrapOpInLaunch(op, *host_device); - } - return WalkResult::advance(); - }; - if (failed(TFTPU::WalkReachableFromTpuCluster(getOperation(), traverse_op))) - return signalPassFailure(); -} - -} // anonymous namespace - -std::unique_ptr> -CreateOutsideCompiledToHostLaunchPass() { - return std::make_unique(); -} - -} // namespace TFDevice -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 12cf30eea9dfff..00bd6166c63521 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -458,15 +458,6 @@ std::unique_ptr> CreateParallelExecuteToIslandsPass( std::unique_ptr> CreateAnnotateParameterReplicationPass(); -// Creates a pass that marks unsupported ops in device cluster for outside -// compilation. -std::unique_ptr> -CreateMarkOpsForOutsideCompilationPass(); - -// Creates a pass that extract outside compilation (Host ops inside cevice -// cluster) ops to a separate parallel_execute region to run on CPU. -std::unique_ptr> CreateExtractOutsideCompilationPass(); - // Creates a pass that merges control flow with similar predicates. 
std::unique_ptr> CreateMergeControlFlowPass(); @@ -481,24 +472,11 @@ CreateDeviceAttributeToLaunchPass(); std::unique_ptr> CreateLaunchToDeviceAttributePass( bool legacy_graph_export = true); -// Creates a pass that extracts ops in tf_device.launch op with host device -// assignment and adds an `_xla_outside_compilation` attribute value. -std::unique_ptr> -CreateHostLaunchToOutsideCompiledPass(); - -// Creates a pass that wraps ops with the same `_xla_outside_compilation` -// attribute value in a tf_device.launch op with host device assignment. -std::unique_ptr> -CreateOutsideCompiledToHostLaunchPass(); - // Creates a pass to ensure that the `_xla_outside_compilation` and // tf_device.launch op no longer exist after Outside Compilation is complete. std::unique_ptr> CreateVerifyNoOutsideCompilationMarkersPass(); -// Create a pass that encapsulates StatefulPartitionedCallOp within a cluster. -std::unique_ptr> CreateXlaClusterFormationPass(); - // Create a pass that inlines the StatefulPartitionedCallOp op based in the // parent region. std::unique_ptr> CreateXlaInlineDeviceOpsPass(); @@ -677,7 +655,6 @@ enum MoveTransposeDirection { kBegin, kEnd }; #define GEN_PASS_DECL_LOCALIZEVARHANDLESPASS #define GEN_PASS_DECL_LOWERQUANTIZEDPASS #define GEN_PASS_DECL_MARKINPUTOUTPUTALIASESPASS -#define GEN_PASS_DECL_MARKOPSFOROUTSIDECOMPILATIONPASS #define GEN_PASS_DECL_MATERIALIZEPASSTHROUGHOP #define GEN_PASS_DECL_MERGECONTROLFLOWPASS #define GEN_PASS_DECL_MOVETRANSPOSESPASS @@ -706,7 +683,6 @@ enum MoveTransposeDirection { kBegin, kEnd }; #define GEN_PASS_DECL_TPUCOLOCATECOMPOSITERESOURCEOPSPASS #define GEN_PASS_DECL_TPUDEVICEPROPAGATIONPASS #define GEN_PASS_DECL_TPUDYNAMICLAYOUTPASS -#define GEN_PASS_DECL_TPUEXTRACTOUTSIDECOMPILATIONPASS #define GEN_PASS_DECL_TPUHOSTCOMPUTATIONEXPANSIONPASS #define GEN_PASS_DECL_TPUIDENTITYPRUNINGPASS #define GEN_PASS_DECL_TPUMERGEVARIABLESWITHEXECUTEPASS diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index c458b2c6cd8725..afad8871b399ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -1896,7 +1896,8 @@ bool ShapeInference::InferShapeForXlaSelectAndScatterOp( bool ShapeInference::InferShapeForXlaGatherOp(XlaGatherOp op) { xla::Shape input_shape = xla::TypeToShape(op.getOperand().getType()); - if (input_shape == xla::Shape()) return false; + if (input_shape == xla::Shape() || input_shape.is_unbounded_dynamic()) + return false; xla::Shape start_indices_shape = xla::TypeToShape(op.getStartIndices().getType()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td index 8bfda6dbb25c55..c89c909375df67 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td @@ -154,80 +154,6 @@ def DeviceAttributeToLaunchPass : Pass<"tf-device-attribute-to-launch", "mlir::f let constructor = "TFDevice::CreateDeviceAttributeToLaunchPass()"; } -def OutsideCompiledToHostLaunchPass : Pass<"tf-outside-compiled-to-host-launch", "ModuleOp"> { - let summary = "Wraps each op with the _xla_outside_compiled attribute in a separate tf_device.launch on replicated host device."; - - let description = [{ - This pass wraps ops with the same `_xla_outside_compilation` - attribute value in a tf_device.launch op with 
host device assignment. The - `_xla_outside_compilation` attribute is deleted from the wrapped ops. - - A simple example: - - ```mlir - "tf_device.cluster"() ( { - "tf.A"() - "tf.B"() {_xla_outside_compilation = "cluster1"} - "tf.C"() - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} - ``` - - Would become the following ops (unimportant attribute, type are omitted): - - ```mlir - "tf_device.cluster"() ( { - "tf.A"() - "tf_device.launch"() { - "tf.B"() // Note xla_outside_compilation attribute deleted. - tf_device.return - } {device = "TPU_REPLICATED_HOST_0"} : () -> () - "tf.C"() - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} - ``` - }]; - - let constructor = "TFDevice::CreateOutsideCompiledToHostLaunchPass()"; -} - -def HostLaunchToOutsideCompiledPass : Pass<"tf-device-host-launch-to-outside-compiled", "ModuleOp"> { - let summary = "Converts each op wrapped in launch op with host device assignnment to op with _xla_outside_compiled attribute."; - - let description = [{ - This pass takes ops wrapped in a tf_device.launch op with host device - assignment extracts them from launch and adds an `_xla_outside_compilation` - attribute. This is the inverse of OutsideCompiledToHostLaunchPass. - - A simple example: - - ```mlir - "tf_device.cluster"() ( { - "tf.A"() - "tf_device.launch"() { - "tf.B"() - tf_device.return - } {device = "TPU_REPLICATED_HOST_0"} : () -> () - "tf.C"() - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} - ``` - - Would become the following ops (unimportant attribute, type are omitted): - - ```mlir - "tf_device.cluster"() ( { - "tf.A"() - "tf.B"() {_xla_outside_compilation = "cluster1"} - "tf.C"() - tf_device.return - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} - ``` - }]; - - let constructor = "TFDevice::CreateHostLaunchToOutsideCompiledPass()"; -} - def VerifyNoOutsideCompilationMarkersPass : Pass<"verify-no-outside-compilation-markers", "mlir::func::FuncOp"> { let summary = "Verifies that after Outside Compilation passes complete, there are no more _xla_outside_compilation attributes and no tf_device.launch ops."; @@ -337,50 +263,6 @@ def LaunchToDeviceAttributePass : Pass<"tf-launch-to-device-attribute", "mlir::f let constructor = "TFDevice::CreateLaunchToDeviceAttributePass()"; } -def XlaClusterFormationPass : Pass<"tf-xla-cluster-formation", "ModuleOp"> { - let summary = "Encapsulate partitioned calls within a Cluster op"; - let description = [{ - This pass clusters `tf.PartitionedCall` and `tf.StatefulPartitionedCall` - with `_xla_compile_device_type` attribute into a `tf_device.cluster`. - Notice this pass will only rewrite the outermost call if there are nested - calls to avoid nested `tf.XlaLaunch` operations from being created later. 
- - For example, the following code - - ```mlir - func.func @main() -> tensor { - %0 = "tf.StatefulPartitionedCall"() {_xla_compile_device_type = "CPU", f = @stateful_pcall_func} : () -> (tensor) - func.return %0 : tensor - } - - func.func @stateful_pcall_func() -> tensor { - %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - func.return %0 : tensor - } - ``` - - will be transformed into, - - ```mlir - func.func @main() -> tensor { - %0 = "tf_device.cluster"() ({ - %1 = "tf.StatefulPartitionedCall"() {_xla_compile_device_type = "CPU", f = @stateful_pcall_func} : () -> tensor - tf_device.return %1 : tensor - }) : () -> tensor - func.return %0 : tensor - } - - func.func @stateful_pcall_func() -> tensor { - %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - func.return %0 : tensor - } - - ``` - }]; - let constructor = "TFDevice::CreateXlaClusterFormationPass()"; - let dependentDialects = ["tf_device::TensorFlowDeviceDialect"]; -} - def XlaInlineDeviceOpsPass : Pass<"tf-xla-inline-device-ops", "ModuleOp"> { let summary = "Inline all Cluster op based in the parent region"; let constructor = "TFDevice::CreateXlaInlineDeviceOpsPass()"; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td index 782232cb3038f7..b8fa543318778c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td @@ -1226,62 +1226,6 @@ def TPUResourceReadForWritePass : Pass<"tf-tpu-resource-read-for-write", "Module let constructor = "TFTPU::CreateTPUResourceReadForWritePass()"; } -def ExtractOutsideCompilationPass : Pass<"tf-extract-outside-compilation", "ModuleOp"> { - let summary = "Extracts device outside compilation computation to a separate tf_device.parallel_execute region."; - - let description = [{ - This pass extracts a CPU computation cluster with `_xla_outside_compilation` - annotation, which denotes ops that should be run on CPU/host, from a device cluster. - Each outside compilation cluster is moved to - a tf_device.parallel_execute region. The device cluster is also moved to a - tf_device.parallel_execute region. Communication ops between device and host are - added to pass inputs/outputs to/from the outside compiled region. 
- - For example, the following tf_device.cluster with an op marked for `xla_outside_compilation`: - - ```mlir - func @outside_compilation() -> tensor { - %0 = "tf_device.cluster"() ( { - %1 = "tf.Const"() {_xla_outside_compilation = "0", value = dense<1.0> : tensor} : () -> (tensor) - %2 = "tf.Identity"(%1) {_xla_outside_compilation = "0"} : (tensor) -> (tensor) - %3 = "tf.AddV2"(%1, %2) : (tensor, tensor) -> (tensor) - tf_device.return %3 : tensor - }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - return %0 : tensor - } - ``` - - will become a tf_device.parallel_execute op with a CPU/host region and - a tf_device.cluster with communication ops to send data to/from device/host: - - ```mlir - func @outside_compilation() -> tensor { - %0 = "tf_device.parallel_execute"() ( { - "tf_device.launch"() ( { - %1 = "tf._XlaCompileMlirPlaceholderProgramKey"() : () -> tensor<3x!tf_type.string> - %2 = "tf._XlaRecvAtHost"(%1) {device_ordinal = 0 : i64, key = "host_compute_channel_0_0_args"} : (tensor<3x!tf_type.string>) -> tensor - %3 = "tf.Identity"(%2) : (tensor) -> tensor - "tf._XlaSendFromHost"(%3, %1) {device_ordinal = 0 : i64, key = "host_compute_channel_0_0_retvals"} : (tensor, tensor<3x!tf_type.string>) -> () - tf_device.return - }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () - tf_device.return - }, { - %1 = "tf_device.cluster"() ( { - %2 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor - %3 = "tf._XlaHostComputeMlir"(%2) {recv_key = "host_compute_channel_0_0_retvals", send_key = "host_compute_channel_0_0_args", tpu_core = 0 : i64} : (tensor) -> tensor - %4 = "tf.AddV2"(%2, %3) : (tensor, tensor) -> tensor - tf_device.return %4 : tensor - }) {device_assignment = [], num_cores_per_replica = 1 : i64, topology = ""} : () -> tensor - tf_device.return %1 : tensor - }) : () -> tensor - return %0 : tensor - } - ``` - }]; - - let constructor = "TFDevice::CreateExtractOutsideCompilationPass()"; -} - def HoistReplicateInvariantResourceWritesPass : Pass<"tf-hoist-replicate-invariant-resource-writes", "mlir::func::FuncOp"> { let summary = "Hoists writes to replicate invariant resource variables."; @@ -1301,53 +1245,6 @@ def HoistReplicateInvariantResourceWritesPass : Pass<"tf-hoist-replicate-invaria let constructor = "TF::CreateHoistReplicateInvariantResourceWritesPass()"; } -def MarkOpsForOutsideCompilationPass : Pass<"tf-mark-ops-for-outside-compilation", "ModuleOp"> { - let summary = "Marks ops in device cluster for outside compilation if they are unsupported on device."; - - let description = [{ - This pass marks unsupported ops in a device cluster with - `_xla_outside_compilation` attribute so the operations will run on the host - instead of the device. Unsupported ops are ops that can not be code - generated to run on the device for the cluster including: - - 1. String operations on TPUs. - 2. Operations that don't have a kernel defined for the device. - - This pass is conservative in that it will mark all ops for outside compilation - that can not be compiled for the device. Exceptions for this are added for ops - that will be rewritten or decomposed before compiling on device. 
- - - For example, tf_device.cluster op with an unsupported op, tf.UnsupportedOp: - - ```mlir - func @unsupported_op() -> tensor { - %0 = "tf_device.cluster"() ( { - %1 = "tf.UnsupportedOp"() : () -> tensor - %2 = "tf.Identity"(%1) : (tensor) -> tensor - tf_device.return %2 : tensor - }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor - return %0 : tensor - } - ``` - - will mark tf.UnsupportedOp with `_xla_outside_compilation` attribute: - - ```mlir - func @unsupported_op() -> tensor { - %0 = "tf_device.cluster"() ( { - %1 = "tf.UnsupportedOp"() {_xla_outside_compilation = "auto0"} : () -> tensor - %2 = "tf.Identity"(%1) : (tensor) -> tensor - tf_device.return %2 : tensor - }) {allow_soft_placement = true, device_assignment = [], num_cores_per_replica = 1 : i64, topology = ""} : () -> tensor - return %0 : tensor - } - ``` - }]; - - let constructor = "TFDevice::CreateMarkOpsForOutsideCompilationPass()"; -} - def FunctionalControlFlowToRegionsPass : Pass<"tf-functional-control-flow-to-regions", "ModuleOp"> { let summary = "Transforms functional control flow operations to their region-based counterparts"; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index f27e1f62f074fe..62afa2b10ed67b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" @@ -158,186 +159,6 @@ LogicalResult EncapsulateFuncAndSerialize(const std::string& module_name, return success(); } -// Populates a TPUCompileMetadataProto with StepMarkerLocation from a -// `tf_device::ClusterFuncOp`. -LogicalResult SetMetadataProtoStepMarkerLocation( - tf_device::ClusterFuncOp op, - tensorflow::tpu::TPUCompileMetadataProto* metadata) { - auto step_marker_location = - op->getAttrOfType(kStepMarkerLocationAttr); - if (!step_marker_location) - return op.emitOpError(CreateMissingAttributeMsg(kStepMarkerLocationAttr)); - - // Default to `STEP_MARK_AT_ENTRY` for step marker location if attribute is - // empty. - xla::DebugOptions::StepMarkerLocation location = - xla::DebugOptions::STEP_MARK_AT_ENTRY; - if (!step_marker_location.getValue().empty() && - !xla::DebugOptions::StepMarkerLocation_Parse( - std::string(step_marker_location.getValue()), &location)) - return op.emitOpError(llvm::formatv("bad '{0}' attribute with value '{1}'", - kStepMarkerLocationAttr, - step_marker_location.getValue())); - - metadata->set_step_marker_location(location); - - return success(); -} - -// Parses a xla::OpSharding from a string attribute. 
-LogicalResult SetOpSharding(Operation* op, Attribute attr, llvm::StringRef name, - int index, xla::OpSharding* sharding_ptr) { - auto sharding_attr = attr.dyn_cast(); - if (!sharding_attr) - return op->emitOpError( - llvm::formatv(kBadStringArrayElementMsg, name, index)); - if (tensorflow::DecodeShardingAttribute(sharding_attr, *sharding_ptr) - .failed()) { - return op->emitOpError(llvm::formatv(kBadArrayElementMsg, name, index, - sharding_attr.getValue(), - "xla::OpSharding")); - } - return success(); -} - -// Populates a TPUCompileMetadataProto with argument types and sharding from a -// `tf_device::ClusterFuncOp`. -LogicalResult SetMetadataProtoArgs( - tf_device::ClusterFuncOp op, - tensorflow::tpu::TPUCompileMetadataProto* metadata) { - auto input_shardings = - op->getAttrOfType(tensorflow::kInputShardingAttr); - if (!input_shardings) - return op.emitOpError( - CreateMissingAttributeMsg(tensorflow::kInputShardingAttr)); - - if (input_shardings.size() != op.getNumOperands()) - return op.emitOpError( - llvm::formatv(kBadArrayAttrLengthMsg, tensorflow::kInputShardingAttr, - op.getNumOperands(), input_shardings.size())); - - // Set args metadata in proto. - mlir::StringAttr replication_attr_name = mlir::StringAttr::get( - op.getContext(), "mhlo.is_same_data_across_replicas"); - - auto dynamic_arg_idx = op->getAttrOfType(TF::kDynamicArgIndexAttr); - llvm::SmallSet dynamic_arg_idx_set; - if (dynamic_arg_idx) { - for (auto idx : dynamic_arg_idx.getValue()) { - dynamic_arg_idx_set.insert(idx.dyn_cast().getInt()); - } - } - - for (auto operand_type_and_idx : llvm::enumerate(op.getOperandTypes())) { - Type operand_type = operand_type_and_idx.value(); - int index = operand_type_and_idx.index(); - tensorflow::tpu::TPUCompileMetadataProto::Arg* arg = metadata->add_args(); - tensorflow::DataType dtype; - tensorflow::Status status = - tensorflow::ConvertToDataType(operand_type, &dtype); - if (!status.ok()) - return op.emitOpError( - llvm::formatv("failed to determine operand type at index {0}: {1}", - index, status.message())); - - arg->set_dtype(dtype); - // TODO(lyandy): Support other arg kinds. - if (dtype == tensorflow::DT_RESOURCE) - arg->set_kind(tensorflow::tpu::TPUCompileMetadataProto::Arg::VARIABLE); - else - arg->set_kind(tensorflow::tpu::TPUCompileMetadataProto::Arg::PARAMETER); - - // Populate argument shapes. - *arg->mutable_shape() = tensorflow::TensorShapeProto(); - if (auto ranked_tensor_type = operand_type.dyn_cast()) { - tensorflow::TensorShapeProto shape_proto; - ConvertToTensorShapeProto(ranked_tensor_type.getShape(), &shape_proto); - *arg->mutable_shape() = std::move(shape_proto); - } else { - arg->mutable_shape()->set_unknown_rank(true); - } - - if (failed(SetOpSharding(op, input_shardings.getValue()[index], - tensorflow::kInputShardingAttr, index, - arg->mutable_sharding()))) - return failure(); - - // Populate set_is_same_data_across_replicas - // Note: this information is duplicated and can be removed from the proto - // and here once MLIR bridge phase 2 doesn't fallback to the old bridge. - auto attr = op.getFuncOp().getArgAttrOfType( - index, replication_attr_name); - arg->set_is_same_data_across_replicas(attr != nullptr && attr.getValue()); - - // Currently only support first dimension to be bounded dynamic. - arg->mutable_is_bounded_dynamic_dim()->Add( - dynamic_arg_idx_set.contains(index)); - } - - return success(); -} - -// Populates a TPUCompileMetadataProto with result sharding from a -// `tf_device::ClusterFuncOp`. 
-LogicalResult SetMetadataProtoRetvals( - tf_device::ClusterFuncOp op, - tensorflow::tpu::TPUCompileMetadataProto* metadata) { - auto output_shardings = - op->getAttrOfType(tensorflow::kOutputShardingAttr); - if (!output_shardings) - return op.emitOpError( - CreateMissingAttributeMsg(tensorflow::kOutputShardingAttr)); - - if (output_shardings.size() != op.getNumResults()) - return op.emitOpError( - llvm::formatv(kBadArrayAttrLengthMsg, tensorflow::kOutputShardingAttr, - op.getNumResults(), output_shardings.size())); - - // Set retvals metadata in proto. - for (auto output_sharding_and_idx : llvm::enumerate(output_shardings)) - if (failed(SetOpSharding(op, output_sharding_and_idx.value(), - tensorflow::kOutputShardingAttr, - output_sharding_and_idx.index(), - metadata->add_retvals()->mutable_sharding()))) - return failure(); - - return success(); -} - -// Populates a TPUCompileMetadataProto from attributes of a -// `tf_device::ClusterFuncOp`. If any necessary attributes are missing from the -// op, a failure will be returned. -// TODO(lyandy): Support session handle and guaranteed consts. -LogicalResult SetMetadataProtoFromClusterFuncOp( - tf_device::ClusterFuncOp op, int num_replicas, int num_cores_per_replica, - std::optional&& xla_device_assignment, - tensorflow::tpu::TPUCompileMetadataProto* metadata) { - if (auto options_attr = - op->getAttrOfType("tpu_compile_options_proto")) { - if (!metadata->mutable_compile_options()->ParseFromArray( - options_attr.data(), options_attr.size())) { - return failure(); - } - } - metadata->set_num_replicas(num_replicas); - metadata->set_num_cores_per_replica(num_cores_per_replica); - - if (failed(SetMetadataProtoStepMarkerLocation(op, metadata))) - return failure(); - - if (xla_device_assignment.has_value()) - *metadata->mutable_device_assignment() = - std::move(xla_device_assignment.value()); - auto use_spmd_attr = op->getAttrOfType(kUseXlaSpmdAttr); - if (!use_spmd_attr) - return op.emitOpError(CreateMissingAttributeMsg(kUseXlaSpmdAttr)); - metadata->set_use_spmd_for_xla_partitioning(use_spmd_attr.getValue()); - - if (failed(SetMetadataProtoArgs(op, metadata))) return failure(); - - return SetMetadataProtoRetvals(op, metadata); -} - // Create a `tf._TPUCompileMlir` that contains a MLIR module that is // functionally equivalent to the function referenced by cluster_func. Operation* BuildCompileOp( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc index 42b0516c7cb038..c044eff3e15c32 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc @@ -100,8 +100,10 @@ FailureOr RenameStablehloFunctions( MLIRContext *context, SymbolTableCollection &symbol_tables, ModuleOp tf_module, ModuleOp stablehlo_module) { SymbolTable &tf_symbol_table = symbol_tables.getSymbolTable(tf_module); - SymbolTable &stablehlo_symbol_table = - symbol_tables.getSymbolTable(stablehlo_module); + // `stablehlo_module` is deleted right after the deserialization, so no need + // to store its `SymbolTable` to `SymbolTableCollection`. 
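The distinction the new comment draws is that SymbolTableCollection::getSymbolTable builds and caches a SymbolTable keyed by the operation, which only pays off for modules that are looked up repeatedly, while a locally constructed SymbolTable is simply dropped with its scope. A small sketch of the uncached form; LookupOnce is a hypothetical helper, not part of this change:

```cpp
#include "llvm/ADT/StringRef.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/IR/BuiltinOps.h"            // from @llvm-project
#include "mlir/IR/SymbolTable.h"           // from @llvm-project

// Looks up a function by name without caching the table anywhere; suitable
// when the module is about to be discarded.
mlir::func::FuncOp LookupOnce(mlir::ModuleOp module, llvm::StringRef name) {
  mlir::SymbolTable table(module);  // built locally, dropped at scope exit
  return table.lookup<mlir::func::FuncOp>(name);
}
```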
+ SymbolTable stablehlo_symbol_table(stablehlo_module); + Builder builder(context); StringAttr main_func_name; for (auto func : stablehlo_module.getOps()) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc index 830dd1cb124705..f8752e316233dd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite_v2.cc @@ -251,10 +251,8 @@ mlir::LogicalResult RemapOutputsFromLogicalDevices( mlir::tf_device::ParallelExecuteOp old_parallel_execute, int cluster_idx, mlir::tf_device::ParallelExecuteOp new_parallel_execute, mlir::OpBuilder* builder) { - for (const auto& result_and_index : + for (auto [output_index, old_parallel_execute_output] : llvm::enumerate(old_parallel_execute.getResults())) { - const auto output_index = result_and_index.index(); - const auto old_parallel_execute_output = result_and_index.value(); const auto output_from_logical_device = new_parallel_execute.GetRegionOutputs(cluster_idx)[output_index]; old_parallel_execute_output.replaceAllUsesWith(output_from_logical_device); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/BUILD b/tensorflow/compiler/mlir/tensorflow/translate/BUILD new file mode 100644 index 00000000000000..46af8590c8108e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/translate/BUILD @@ -0,0 +1,339 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) + +cc_library( + name = "import_model", + srcs = [ + "import_model.cc", + ], + hdrs = [ + "export_graphdef.h", + "import_model.h", + ], + deps = [ + ":mlir_roundtrip_flags", + ":upgrade_graph", + "//tensorflow/cc/saved_model:bundle_v2", + "//tensorflow/cc/saved_model:constants", + "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/cc/saved_model:loader_util", + "//tensorflow/compiler/jit:shape_inference_helpers", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:convert_attr", + "//tensorflow/compiler/mlir/tensorflow:convert_tensor", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", + "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:initialize_variables_in_session_init", + "//tensorflow/compiler/mlir/tensorflow/transforms:lift_variables_lib", + "//tensorflow/compiler/mlir/tensorflow/transforms:mark_initialized_variables_lib", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:graph", + 
"//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler/utils:transitive_fanin", + "//tensorflow/core/platform:crash_analysis", + "//tensorflow/core/platform:types", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@local_xla//xla:status_macros", + "@local_xla//xla/client:sharding_builder", + "@local_xla//xla/hlo/ir:hlo", + "@local_xla//xla/service:hlo_parser", + ], +) + +tf_cc_test( + name = "tf_mlir_translate_registration_test", + size = "small", + srcs = ["tf_mlir_translate_registration_test.cc"], + deps = [ + ":translate_registration", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:TranslateLib", + ], +) + +cc_library( + name = "export_tf_dialect_op", + srcs = [ + "export_tf_dialect_op.cc", + ], + hdrs = [ + "export_tf_dialect_op.h", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:export_utils", + "//tensorflow/compiler/mlir/utils:string_container_utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:IR", + "@local_xla//xla:status_macros", + ], +) + +cc_library( + name = "translate_tf_dialect_op", + srcs = ["translate_tf_dialect_op.cc"], + deps = [ + ":export_tf_dialect_op", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TranslateLib", + ], + alwayslink = 1, +) + +cc_library( + name = "mlir_roundtrip_pass", + srcs = ["mlir_roundtrip_pass.cc"], + hdrs = ["mlir_roundtrip_pass.h"], + deps = [ + ":export_graphdef", + ":import_model", + ":mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@local_xla//xla:status_macros", + ], +) + +cc_library( + name = "mlir_roundtrip_pass_registration", + srcs = ["mlir_roundtrip_pass_registration.cc"], + deps = [ + ":mlir_roundtrip_pass", + ], + alwayslink = 1, +) + +cc_library( + name = "mlir_roundtrip_flags", + srcs = ["mlir_roundtrip_flags.cc"], + hdrs = ["mlir_roundtrip_flags.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:types", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@local_xla//xla:status_macros", + ], +) + +cc_library( + name = "mlir_import_options", + hdrs = 
["mlir_import_options.h"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "translate_lib", + srcs = ["tf_mlir_translate.cc"], + hdrs = ["tf_mlir_translate.h"], + visibility = ["//visibility:public"], + deps = [ + ":import_model", + ":mlir_roundtrip_flags", + "//tensorflow/cc/saved_model:bundle_v2", + "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/cc/saved_model:reader", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:import_utils", + "//tensorflow/compiler/mlir/tensorflow:mangling_util", + "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler/utils:transitive_fanin", + "//tensorflow/core/util/tensor_bundle:byteswaptensor", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + ], +) + +cc_library( + name = "translate_cl_options", + srcs = [ + "tf_mlir_translate_cl.cc", + ], + hdrs = [ + "tf_mlir_translate_cl.h", + ], + deps = [ + "@llvm-project//llvm:Support", + ], + alwayslink = 1, +) + +cc_library( + name = "export_graphdef", + srcs = [ + "export_graphdef.cc", + ], + hdrs = [ + "export_graphdef.h", + ], + visibility = ["//visibility:public"], + deps = [ + ":export_tf_dialect_op", + ":mlir_roundtrip_flags", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:export_utils", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/mlir/tensorflow:verify_suitable_for_graph_export", + "//tensorflow/compiler/mlir/utils:name_utils", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/graph/regularization:util", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@local_xla//xla:status_macros", + ], +) + +cc_library( + name = "translate_registration", + srcs = [ + "tf_mlir_translate_registration.cc", + ], + deps = [ + ":export_graphdef", + ":mlir_roundtrip_flags", + ":translate_cl_options", + ":translate_lib", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:TranslateLib", + "@local_xla//xla/client:client_library", + "@local_xla//xla/client:compile_only_client", + "@local_xla//xla/service/cpu:cpu_compiler", + 
"@local_xla//xla/service/cpu:cpu_transfer_manager", + "@local_xla//xla/stream_executor", + "@local_xla//xla/stream_executor/host:host_platform", + "@local_xla//xla/stream_executor/host:host_platform_id", + ], + alwayslink = 1, +) + +cc_library( + name = "split_into_island_per_op_pass", + srcs = ["split_into_island_per_op_pass.cc"], + hdrs = [ + "split_into_island_per_op_pass.h", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_executor_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_pass_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "upgrade_graph", + srcs = ["upgrade_graph.cc"], + hdrs = ["upgrade_graph.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/tf2xla:functionalize_control_flow", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core/common_runtime:device", + "//tensorflow/core/common_runtime:device_factory", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:grappler_item_builder", + "//tensorflow/core/grappler/clusters:virtual_cluster", + "//tensorflow/core/grappler/optimizers:meta_optimizer", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@llvm-project//llvm:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc index 260caf3494be9c..4a19c06154b6d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc @@ -102,7 +102,7 @@ mlir::LogicalResult EvaluateOperation( RETURN_FAILURE_IF_ERROR(status); } - VLOG(1) << "Start to evaluate node: " << node_def->DebugString(); + VLOG(1) << "Start to evaluate node: " << *node_def; // Adds inputs to the TF operation. for (const auto operand : operands) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc new file mode 100644 index 00000000000000..7a6da9fcbd04d2 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.cc @@ -0,0 +1,63 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h" + +#include <string> + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +std::string GetDeviceAttrAsResourceInstanceStr(mlir::Operation* op) { + auto device_attr = op->getAttrOfType<StringAttr>("device"); + // Treat missing device attribute like unspecified (= empty string) attribute. + // Note that different op instances with the same string (including empty + // string) are seen as dependent (same resource instance). + if (!device_attr) return ""; + return device_attr.str(); +} + +void MarkResourceAsReadAndWrite( + Value value, + SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>& + effects) { + if (value.getType().cast<TensorType>().getElementType().isa<ResourceType>()) { + effects.emplace_back(MemoryEffects::Read::get(), value, + ResourceEffects::Variable::get()); + effects.emplace_back(MemoryEffects::Write::get(), value, + ResourceEffects::Variable::get()); + } +} + +void MarkResourceAsReadOnly( + Value value, + SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>& + effects) { + if (value.getType().cast<TensorType>().getElementType().isa<ResourceType>()) { + effects.emplace_back(MemoryEffects::Read::get(), value, + ResourceEffects::Variable::get()); + } +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h b/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h new file mode 100644 index 00000000000000..c55ad530f15962 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h @@ -0,0 +1,44 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SIDE_EFFECT_ANALYSIS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SIDE_EFFECT_ANALYSIS_UTIL_H_ + +#include <string> + +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +std::string GetDeviceAttrAsResourceInstanceStr(Operation* op); + +void MarkResourceAsReadAndWrite( + Value value, + SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>& + effect); + +void MarkResourceAsReadOnly( + Value value, + SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>& + effect); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SIDE_EFFECT_ANALYSIS_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 208311345c3f8e..c6ff5f5c93c6ef 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -37,6 +37,7 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -189,10 +190,22 @@ std::string GetTPUCompilationDevice(ParsedDevice system_device) { } // Find the host CPU device for a given TPU device with `DEVICE_CPU` as its -// type and `id` 0. -std::string GetCPUHostDeviceForTPUDevice(ParsedDevice tpu_device) { +// type. If multiple local CPU devices are not enabled, always assign id 0. +// Otherwise, use the same id as the TPU device. +StatusOr<std::string> GetCPUHostDeviceForTPUDevice(ParsedDevice tpu_device, + ParsedDevices devices) { tpu_device.type = DEVICE_CPU; - tpu_device.id = 0; + bool enable_multiple_local_cpu_devices = + tensorflow::GetMlirCommonFlags() + ->tf_mlir_enable_multiple_local_cpu_devices; + if (!enable_multiple_local_cpu_devices) { + tpu_device.id = 0; + } + if (FindMatchingDevices(devices, tpu_device).empty()) { + return absl::InvalidArgumentError(absl::StrCat( + "Can't find device: ", DeviceNameUtils::ParsedNameToString(tpu_device), + " in the devices list.")); + } return DeviceNameUtils::ParsedNameToString(tpu_device); } @@ -203,7 +216,8 @@ std::string GetCPUHostDeviceForTPUDevice(ParsedDevice tpu_device) { // number of TPU devices available, and `num_cores_per_replica` must be 1.
StatusOr GetFullMeshTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, - llvm::ArrayRef> tpu_devices) { + llvm::ArrayRef> tpu_devices, + ParsedDevices devices) { const int num_tasks = tpu_devices.size(); const int num_tpus_per_task = tpu_devices[0].size(); const int num_tpu_devices = num_tasks * num_tpus_per_task; @@ -226,7 +240,7 @@ StatusOr GetFullMeshTPUExecutionDeviceAssignment( const auto& tpu_device = tpu_devices[task][device]; devices_and_hosts.push_back({TPUDeviceAndHost( /*device=*/tensorflow::DeviceNameUtils::ParsedNameToString(tpu_device), - /*host=*/GetCPUHostDeviceForTPUDevice(tpu_device))}); + /*host=*/*GetCPUHostDeviceForTPUDevice(tpu_device, devices))}); } return devices_and_hosts; @@ -365,7 +379,7 @@ StatusOr> GetGeneralTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices, - llvm::StringRef topology_attr, + ParsedDevices devices, llvm::StringRef topology_attr, llvm::ArrayRef device_assignment_attr) { const int num_tasks = tpu_devices.size(); const int num_tpus_per_task = tpu_devices[0].size(); @@ -431,7 +445,7 @@ GetGeneralTPUExecutionDeviceAssignment( auto& device_and_host = devices_and_hosts[replica][logical_core]; const auto& tpu_device = tpu_devices[task][device]; device_and_host.device = DeviceNameUtils::ParsedNameToString(tpu_device); - device_and_host.host = GetCPUHostDeviceForTPUDevice(tpu_device); + device_and_host.host = *GetCPUHostDeviceForTPUDevice(tpu_device, devices); } } @@ -626,9 +640,10 @@ StatusOr GetTPUCompilationAndExecutionDevices( absl::StrCat("'", kDeviceAssignmentAttr, "' must not be set when '", kTopologyAttr, "' is not set")); - TF_ASSIGN_OR_RETURN(auto execution_devices, - GetFullMeshTPUExecutionDeviceAssignment( - num_replicas, num_cores_per_replica, tpu_devices)); + TF_ASSIGN_OR_RETURN( + auto execution_devices, + GetFullMeshTPUExecutionDeviceAssignment( + num_replicas, num_cores_per_replica, tpu_devices, devices)); return TPUDeviceAssignment(compilation_device, std::move(execution_devices)); } @@ -636,7 +651,7 @@ StatusOr GetTPUCompilationAndExecutionDevices( TF_ASSIGN_OR_RETURN(auto devices_and_ids, GetGeneralTPUExecutionDeviceAssignment( num_replicas, num_cores_per_replica, tpu_devices, - topology_attr, device_assignment_attr)); + devices, topology_attr, device_assignment_attr)); return TPUDeviceAssignment(compilation_device, std::move(devices_and_ids.first), std::move(devices_and_ids.second)); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index fb88bc8bc44530..2c749b549cdc86 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -118,8 +118,10 @@ TEST_P(ParameterizedMetadataTest, BadMetadata) { ASSERT_TRUE(DeviceNamesToParsedNames( {"/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", + "/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:1/device:TPU_SYSTEM:0", - "/job:worker/replica:0/task:1/device:TPU:0"}, + "/job:worker/replica:0/task:1/device:TPU:0", + "/job:worker/replica:0/task:1/device:CPU:0"}, &devices)); std::string compilation_device; llvm::SmallVector, 8> execution_devices; @@ -863,6 +865,7 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceNotReplicated) { builder.getStrArrayAttr(llvm::ArrayRef( {"/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", 
"/job:localhost/replica:0/task:0/device:TPU:0", + "/job:localhost/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"}))); llvm::SmallVector result_types; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index c943f0c9ec3aa1..58adaa41349b14 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -639,10 +639,8 @@ mlir::LogicalResult RemapOutputsFromLogicalDevices( mlir::tf_device::ParallelExecuteOp old_parallel_execute, int cluster_idx, mlir::tf_device::ParallelExecuteOp new_parallel_execute, mlir::OpBuilder* builder) { - for (const auto& result_and_index : + for (auto [output_index, old_parallel_execute_output] : llvm::enumerate(old_parallel_execute.getResults())) { - const auto output_index = result_and_index.index(); - const auto old_parallel_execute_output = result_and_index.value(); if (output_index < num_results_pre_cluster) { // Replace the use of those results of old parallel_execute op from host // with corresponding results of new parallel_execute op diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD index d0653b9677d5c7..693d1f37766d81 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD @@ -3,13 +3,17 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = ["//visibility:public"], + default_visibility = [ + "//tensorflow/compiler/mlir/tf2xla/api:__subpackages__", + "//tensorflow/compiler/mlir/tf2xla/internal:__subpackages__", + ], ) cc_library( name = "compile_mlir_util_no_tf_dialect_passes", srcs = ["compile_mlir_util.cc"], hdrs = ["compile_mlir_util.h"], + visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", "//tensorflow/compiler/mlir/tensorflow", @@ -46,6 +50,7 @@ cc_library( "//tensorflow/core/platform:logging", "//tensorflow/core/platform:status", "//tensorflow/core/tpu:tpu_defs", + "@com_google_absl//absl/base:core_headers", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -94,7 +99,6 @@ cc_library( srcs = ["compile_tf_graph.cc"], hdrs = ["compile_tf_graph.h"], deps = [ - ":compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", @@ -105,6 +109,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:set_tpu_infeed_layout", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -113,13 +118,21 @@ cc_library( "//tensorflow/core/tpu/kernels:tpu_compile_op_support", "//tensorflow/core/tpu/kernels:tpu_compile_proto_cc", "//tensorflow/core/tpu/kernels:tpu_util", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:variant", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", "@local_tsl//tsl/platform:status", 
"@local_tsl//tsl/platform:statusor", + "@local_xla//xla:shape_util", + "@local_xla//xla:status_macros", "@local_xla//xla/client:compile_only_client", + "@local_xla//xla/hlo/ir:hlo", "@local_xla//xla/mlir_hlo:hlo_dialect_registration", "@local_xla//xla/pjrt:compile_options_proto_cc", ], @@ -167,6 +180,9 @@ cc_library( name = "cluster_tf", srcs = ["cluster_tf.cc"], hdrs = ["cluster_tf.h"], + visibility = [ + "//tensorflow/compiler/tf2xla:__pkg__", + ], deps = [ ":tf_dialect_to_executor", "//tensorflow/compiler/mlir/tensorflow:bridge_logger", @@ -225,6 +241,7 @@ cc_library( name = "tf_dialect_to_executor", srcs = ["tf_dialect_to_executor.cc"], hdrs = ["tf_dialect_to_executor.h"], + visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir/tensorflow:bridge_logger", diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc index 2f8469ee3f6f69..bb27edab8aa88a 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc @@ -103,6 +103,7 @@ tensorflow::Status RunTFXLABridge( } PassManager bridge(module.getContext()); + bridge.enableVerifier(); ::tensorflow::applyTensorflowAndCLOptions(bridge); // Populate a passmanager with the list of passes that implement the bridge. diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h index 12e2212ba81445..3f6e446ca28fd9 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "absl/base/attributes.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -63,8 +64,7 @@ namespace tensorflow { // result shapes. // custom_legalization_passes: passes to run before the default TF legalization // passes for backend-specific ops. -// -// TODO(hinsu): Migrate options to a separate struct. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, @@ -98,6 +98,7 @@ Status ConvertMLIRToXlaComputation( // true, includes legalization and MHLO lowering passes. // allow_partial_conversion: when this is true, allow operations that can't be // legalized. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") void CreateConvertMlirToXlaHloPipeline( mlir::OpPassManager& pm, llvm::StringRef device_type, bool enable_op_fallback, @@ -112,12 +113,14 @@ struct TensorOrResourceShape { }; // Refine MLIR types based on new shape information. +ABSL_DEPRECATED("Not meant to be used directly and should be a util.") Status RefineShapes(llvm::ArrayRef arg_shapes, mlir::ModuleOp module); // Lower TF to MHLO and insert HLO into the XlaBuilder. xla_params are HLO-level // inputs to module_op that have already been added to the XlaBuilder. returns // are the returned XlaOps. 
+ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") Status BuildHloFromTf(mlir::ModuleOp module_op, xla::XlaBuilder& builder, llvm::ArrayRef xla_params, std::vector& returns, @@ -129,6 +132,7 @@ Status BuildHloFromTf(mlir::ModuleOp module_op, xla::XlaBuilder& builder, // Apply shape, description, and resource information to inputs and outputs // in the XlaCompilationResult. This should be called after // compilation_result->computation was set. +ABSL_DEPRECATED("Not meant to be used directly and should be a util.") Status PopulateResultIOInfo( mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, bool use_tuple_args, bool use_resource_updates_for_aliases, @@ -142,6 +146,7 @@ Status PopulateResultIOInfo( // // If enable_op_fallback is set to false, graph is legalized only if the graph // analysis for the graph is successful. Otherwise, an error is returned. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") StatusOr CompileMlirToXlaHlo( mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, @@ -157,6 +162,7 @@ StatusOr CompileMlirToXlaHlo( // // If lower_to_xla_hlo is true then compiles down into XLA HLO, generates all // accompanying metadata and stores them in CompilationResult. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") StatusOr CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, @@ -172,6 +178,7 @@ StatusOr CompileSerializedMlirToXlaHlo( // metadata and stores them in CompilationResult. This will rewrite arguments // and run the TensorFlow standard pipeline prior to invoking // `CompileMlirToXlaHlo`. +ABSL_DEPRECATED("Use v2/legalize_tf.h::LegalizeMlirToHlo instead.") Status CompileGraphToXlaHlo( mlir::ModuleOp module_op, llvm::ArrayRef args, llvm::StringRef device_type, bool use_tuple_args, bool enable_op_fallback, @@ -183,6 +190,8 @@ Status CompileGraphToXlaHlo( // Compiles a TensorFlow Graph into XLA HLO, generates all accompanying metadata // and stores them in CompilationResult. +ABSL_DEPRECATED( + "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHloinstead.") Status CompileGraphToXlaHlo( const Graph& graph, llvm::ArrayRef args, llvm::ArrayRef control_rets, llvm::StringRef device_type, @@ -197,6 +206,8 @@ Status CompileGraphToXlaHlo( // XlaBuilder. This function adds HLO to a larger HLO computation, so // HLO-level inputs are supplied, and HLO-level outputs are produced. // xla_params is the HLO-level inputs and returns is the HLO-level outputs. +ABSL_DEPRECATED( + "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHloinstead.") Status BuildHloFromGraph( const Graph& graph, xla::XlaBuilder& builder, mlir::MLIRContext& mlir_context, llvm::ArrayRef xla_params, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc index ace94d1e17303d..003732ffb22f5a 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc @@ -15,12 +15,21 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h" +#include #include #include #include #include +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" +#include "llvm/ADT/DenseMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -31,14 +40,30 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/tf2xla/layout_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "xla/client/compile_only_client.h" +#include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/mlir_hlo/mhlo/IR/register.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "xla/status_macros.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/lib/monitoring/counter.h" #include "tensorflow/core/lib/monitoring/sampler.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/profile_utils/cpu_utils.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/tpu_compile.h" +#include "tsl/lib/monitoring/sampler.h" +#include "tsl/platform/errors.h" #include "tsl/platform/status.h" #include "tsl/platform/statusor.h" @@ -165,10 +190,8 @@ Status PrepareAndExportToLibrary(mlir::ModuleOp module, flib_def); } -} // namespace - -tsl::Status CompileTensorflowGraphToHlo( - const std::variant& computation, +tsl::Status CompileTFFunctionWithoutMlir( + FunctionToHloArgs function_computation, const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, const XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_funcs, @@ -177,45 +200,40 @@ tsl::Status CompileTensorflowGraphToHlo( std::vector>* per_core_arg_shapes, xla::CompileOnlyClient* client, XlaCompiler::CompilationResult* compilation_result) { - LOG_FIRST_N(INFO, 1) << "Compiling MLIR computation to XLA HLO using the " - "old (non-MLIR) tf2xla bridge"; - - *compilation_result = {}; - bool has_mlir = computation.index() == 0; - - std::string mlir_string = has_mlir ? 
"has_mlir" : "has_function_to_hlo"; - const std::string kBridgePhase2Config = - absl::StrCat("graph_old_bridge_", mlir_string); - CompilationTimer timer; - - if (!has_mlir) { - FunctionToHloArgs function_computation = std::get<1>(computation); - Status comp_status = CompileTFFunctionToHlo( - *function_computation.flib_def, function_computation.graph_def_version, - shape_determination_funcs, arg_shapes, - function_computation.guaranteed_constants, - *function_computation.function, metadata, client, arg_core_mapping, - per_core_arg_shapes, use_tuple_args, compilation_result); - if (comp_status.ok()) { - phase2_bridge_compilation_status->GetCell(kOldBridgeNoMlirSuccess) - ->IncrementBy(1); - } else { - phase2_bridge_compilation_status->GetCell(kOldBridgeNoMlirFailure) - ->IncrementBy(1); - } - - phase2_bridge_compilation_time->GetCell(kBridgePhase2Config) - ->Add(timer.ElapsedCyclesInMilliseconds()); - return comp_status; + Status comp_status = CompileTFFunctionToHlo( + *function_computation.flib_def, function_computation.graph_def_version, + shape_determination_funcs, arg_shapes, + function_computation.guaranteed_constants, *function_computation.function, + metadata, client, arg_core_mapping, per_core_arg_shapes, use_tuple_args, + compilation_result); + if (comp_status.ok()) { + phase2_bridge_compilation_status->GetCell(kOldBridgeNoMlirSuccess) + ->IncrementBy(1); + } else { + phase2_bridge_compilation_status->GetCell(kOldBridgeNoMlirFailure) + ->IncrementBy(1); } + return comp_status; +} + +tsl::Status CompileMLIRTFFunction( + tpu::MlirToHloArgs mlir_computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + const XlaShapeLayoutHelpers::ShapeDeterminationFns + shape_determination_funcs, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client, + XlaCompiler::CompilationResult* compilation_result) { mlir::DialectRegistry registry; mlir::RegisterAllTensorFlowDialects(registry); mlir::mhlo::registerAllMhloDialects(registry); mlir::MLIRContext context(registry); mlir::OwningOpRef mlir_module; - TF_RETURN_IF_ERROR(DeserializeMlirModule(std::get<0>(computation).mlir_module, + TF_RETURN_IF_ERROR(DeserializeMlirModule(mlir_computation.mlir_module, &context, &mlir_module)); if (!mlir::SetTPUInfeedLayout(mlir_module)) return errors::Internal("Failed to set layouts attribute"); @@ -256,11 +274,51 @@ tsl::Status CompileTensorflowGraphToHlo( consts, func, metadata, client, arg_core_mapping, per_core_arg_shapes, use_tuple_args, compilation_result)); + return PopulateInputOutputAliasing(main_fn, compilation_result, + use_tuple_args); +} + +} // namespace + +tsl::Status CompileTensorflowGraphToHlo( + const std::variant& computation, + const tpu::TPUCompileMetadataProto& metadata, bool use_tuple_args, + const XlaShapeLayoutHelpers::ShapeDeterminationFns + shape_determination_funcs, + const std::vector& arg_shapes, + std::vector* arg_core_mapping, + std::vector>* per_core_arg_shapes, + xla::CompileOnlyClient* client, + XlaCompiler::CompilationResult* compilation_result) { + LOG_FIRST_N(INFO, 1) << "Compiling MLIR computation to XLA HLO using the " + "old (non-MLIR) tf2xla bridge"; + + CompilationTimer timer; + *compilation_result = {}; + bool has_mlir = computation.index() == 0; + + std::string mlir_string = has_mlir ? 
"has_mlir" : "has_function_to_hlo"; + const std::string kBridgePhase2Config = + absl::StrCat("graph_old_bridge_", mlir_string); + + if (has_mlir) { + TF_RETURN_IF_ERROR(CompileMLIRTFFunction( + std::get<0>(computation), metadata, use_tuple_args, + shape_determination_funcs, arg_shapes, arg_core_mapping, + per_core_arg_shapes, client, compilation_result)); + + } else { + FunctionToHloArgs function_computation = std::get<1>(computation); + TF_RETURN_IF_ERROR(CompileTFFunctionWithoutMlir( + function_computation, metadata, use_tuple_args, + shape_determination_funcs, arg_shapes, arg_core_mapping, + per_core_arg_shapes, client, compilation_result)); + } + phase2_bridge_compilation_time->GetCell(kBridgePhase2Config) ->Add(timer.ElapsedCyclesInMilliseconds()); - return PopulateInputOutputAliasing(main_fn, compilation_result, - use_tuple_args); + return tsl::OkStatus(); } }; // namespace v1 diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc index 236282f625e20a..9d0b884ebbe85d 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc @@ -127,6 +127,7 @@ tensorflow::Status ExportFromTensorflowDialectToExecutor( ModuleOp module, llvm::StringRef module_name) { PassManager tf_to_executor(module.getContext()); ::tensorflow::applyTensorflowAndCLOptions(tf_to_executor); + tf_to_executor.enableVerifier(); AddTfDialectToExecutorPasses(tf_to_executor); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD index 70a84bccff586a..73880851e7abc1 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD @@ -2,25 +2,11 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +# Please reach out to tf-bridge-team@ before using the TF2XLA bridge. package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ ":__subpackages__", - ":tf2xla_users", - ], -) - -# Please reach out to tf-bridge-team@ before using the TF2XLA bridge. -package_group( - name = "tf2xla_users", - packages = [ - "//tensorflow/compiler/mlir/quantization/stablehlo/...", - "//learning/serving/contrib/tfrt/mlir/saved_model_analysis", - "//tensorflow/compiler/mlir/tfrt", - "//tensorflow/compiler/tf2xla", - "//tensorflow/compiler/mlir", - # Legacy due to where the bridge currently runs. This should go away. 
- "//tensorflow/compiler/mlir/tensorflow/transforms", ], ) @@ -28,6 +14,12 @@ cc_library( name = "legalize_tf", srcs = ["legalize_tf.cc"], hdrs = ["legalize_tf.h"], + visibility = [ + "//learning/brain/google/xla:__pkg__", + "//learning/brain/mlir/bridge:__pkg__", + "//tensorflow/compiler/mlir/quantization/stablehlo:__pkg__", + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:__pkg__", + ], deps = [ ":device_type_proto_cc", "//tensorflow/compiler/jit:flags_headers", @@ -99,12 +91,22 @@ tf_proto_library( name = "device_type_proto", srcs = ["device_type.proto"], cc_api_version = 2, + visibility = [ + "//learning/serving/contrib/tfrt/mlir/saved_model_analysis:__pkg__", + ], ) cc_library( name = "cluster_tf", srcs = ["cluster_tf.cc"], hdrs = ["cluster_tf.h"], + visibility = [ + "//learning/serving/contrib/tfrt/mlir/saved_model_analysis:__pkg__", + "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__", + "//tensorflow/compiler/mlir/tfrt:__pkg__", + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:__pkg__", + "//tensorflow/compiler/tf2xla:__pkg__", + ], deps = [ ":device_type_proto_cc", ":tf_dialect_to_executor", @@ -165,6 +167,12 @@ cc_library( name = "tf_dialect_to_executor", srcs = ["tf_dialect_to_executor.cc"], hdrs = ["tf_dialect_to_executor.h"], + visibility = [ + "//learning/serving/contrib/tfrt/mlir/saved_model_analysis:__pkg__", + "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__", + "//tensorflow/compiler/mlir/tfrt:__pkg__", + "//tensorflow/compiler/tf2xla:__pkg__", + ], deps = [ "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir/tensorflow:bridge_logger", diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc index 24de1be6fe97dc..289d4d0faec78e 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc @@ -73,6 +73,7 @@ tensorflow::Status RunTFXLABridge( } PassManager bridge(module.getContext()); + bridge.enableVerifier(); ::tensorflow::applyTensorflowAndCLOptions(bridge); // Populate a passmanager with the list of passes that implement the bridge. @@ -142,6 +143,10 @@ tensorflow::Status RecordIfErrorStatus(const std::string error_prefix, } void CreateClusteringPipeline(OpPassManager &pm, llvm::StringRef module_name) { + // Since the internal bridge clustering passes are shared among TF1/TF2 + // TF2-only passes should go here. However, this should be very rare and + // new passes generally should go into the internal + // AddBridgeClusteringPipelinePasses. 
pm.addPass(mlir::TFTPU::CreateTPUValidateInputsPass()); pm.addNestedPass( mlir::TF::CreateCanonicalizeCompileAndReplicateAttributesPass()); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc index 69f1c0e20a5e1b..455a59d6607c49 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc @@ -126,6 +126,7 @@ tensorflow::Status ExportFromTensorflowDialectToExecutor( ModuleOp module, llvm::StringRef module_name) { PassManager tf_to_executor(module.getContext()); ::tensorflow::applyTensorflowAndCLOptions(tf_to_executor); + tf_to_executor.enableVerifier(); AddTfDialectToExecutorPasses(tf_to_executor); if (VLOG_IS_ON(1) || diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc index 1cad3d1d5cc615..a0261b398fcc8f 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc @@ -34,6 +34,8 @@ namespace internal { using mlir::OpPassManager; using mlir::func::FuncOp; +// LINT.IfChange(tpu_bridge_passes) + // Adds Bridge clustering pipeline passes to the given pass_manager. Does not // run them. void AddBridgeClusteringPipelinePasses(OpPassManager& pm, @@ -80,7 +82,6 @@ void AddBridgeClusteringPipelinePasses(OpPassManager& pm, // Run TPU cluster cleanup attributes so ops with no outside compiled // attribute have no host device attribute. pm.addPass(mlir::TFTPU::CreateTPUClusterCleanupAttributesPass()); - pm.addPass(mlir::TFDevice::CreateOutsideCompiledToHostLaunchPass()); pm.addNestedPass(mlir::TFDevice::CreateDeviceAttributeToLaunchPass()); // Running canonicalizer before decomposing resource ops in cluster helps the // latter pass to converge faster as it does not have to spend time folding @@ -97,10 +98,6 @@ void AddBridgeClusteringPipelinePasses(OpPassManager& pm, func_pm.addPass(mlir::TFTPU::CreateTPUHostComputationExpansionPass()); func_pm.addPass(mlir::TFTPU::CreateTPUUpdateEmbeddingEnqueueOpInputsPass()); } - // TODO(b/173622615): This should incrementally be moved down as - // more passes support this representation and then can be removed once - // all passes support it. - pm.addPass(mlir::TFDevice::CreateHostLaunchToOutsideCompiledPass()); // TODO(b/173622615): Once OutsideCompilation is represented by launch op and // the remaining passes including Inliner support it, remove this @@ -109,9 +106,6 @@ void AddBridgeClusteringPipelinePasses(OpPassManager& pm, // will be removed from launch causing an error. pm.addNestedPass(mlir::TFDevice::CreateLaunchToDeviceAttributePass()); - // TODO(b/173622615): This can be removed once more passes support outside - // compilation represented by op and conversion back to attribute is removed. - pm.addPass(mlir::TFDevice::CreateOutsideCompiledToHostLaunchPass()); // Note that the region-based control-flow produced here still contains // function call ops which get inlined by the subsequent inliner pass. pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); @@ -138,15 +132,12 @@ void AddBridgeClusteringPipelinePasses(OpPassManager& pm, pm.addPass(mlir::TFDevice::CreateMergeControlFlowPass()); } - // TODO(b/173622615): This should incrementally be moved down as - // more passes support this representation and then can be removed once - // all passes support it. 
- pm.addPass(mlir::TFDevice::CreateHostLaunchToOutsideCompiledPass()); - - pm.addPass(mlir::TFDevice::CreateMarkOpsForOutsideCompilationPass()); + pm.addPass( + tensorflow::tf2xla::internal::CreateMarkOpsForOutsideCompilationPass()); pm.addPass(tensorflow::tf2xla::internal:: CreateExtractHeadTailOutsideCompilationPass()); - pm.addPass(mlir::TFDevice::CreateExtractOutsideCompilationPass()); + pm.addPass( + tensorflow::tf2xla::internal::CreateExtractOutsideCompilationPass()); pm.addNestedPass( mlir::TFDevice::CreateVerifyNoOutsideCompilationMarkersPass()); @@ -167,18 +158,21 @@ void AddBridgeClusteringPipelinePasses(OpPassManager& pm, pm.addNestedPass( tensorflow::tf2xla::internal::CreateVerifyClusteringPass()); } +// LINT.ThenChange(:non_tpu_bridge_passes) void NoCanonicalization(OpPassManager& pm) {} +// LINT.IfChange(non_tpu_bridge_passes) void AddNonTPUBridgeClusteringPipelinePasses(OpPassManager& pm) { // The following ops must be preserved regardless of reachability. Ideally, // all graphs should have control dependencies to enforce this. VLOG(2) << "Create TF XLA Bridge pipeline"; + pm.addPass(mlir::TFDevice::CreateXlaValidateInputsPass()); pm.addNestedPass( mlir::TF::CreateCanonicalizeCompileAndReplicateAttributesPass()); - // This pass expectes unified compilation markers. - pm.addPass(mlir::TFDevice::CreateXlaValidateInputsPass()); - const llvm::SmallVector ops_to_preserve = {}; + const llvm::SmallVector ops_to_preserve = { + "tf.TPUReplicateMetadata", "tf.TPUCompilationResult", + "tf.TPUReplicatedOutput"}; pm.addNestedPass( mlir::tf_executor::CreateTFExecutorGraphPruningPass(ops_to_preserve)); // It is assumed at this stage there are no V1 control flow ops as Graph // inference. pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + // The following passes are added to match the TPU pipeline and are + // expected to be no-ops. + pm.addNestedPass(mlir::TFTPU::CreateTPUPartitionedOpConversionPass()); + pm.addNestedPass( + mlir::TFTPU::CreateTPUReorderReplicateAndPartitionedInputsPass()); + pm.addNestedPass(mlir::TF::CreateDecomposeReduceDatasetPass()); + pm.addPass(mlir::TFDevice::CreateEmbeddingPipeliningPass()); + pm.addPass(mlir::TFDevice::CreateEmbeddingSequencingPass()); // Encapsulate PartitionedCall ops within a cluster so that the composite // resource ops can be decomposed. - pm.addPass(mlir::TFDevice::CreateXlaClusterFormationPass()); + pm.addPass(tensorflow::tf2xla::internal::CreateXlaClusterFormationPass()); // Running canonicalizer before decomposing resource ops in cluster helps the // latter pass to converge faster as it does not have to spend time folding // away dead ops. @@ -223,10 +225,12 @@ void AddNonTPUBridgeClusteringPipelinePasses(OpPassManager& pm) { // for generic pipeline is landed. if (tensorflow::GetMlirCommonFlags() ->tf_mlir_enable_generic_outside_compilation) { - pm.addPass(mlir::TFDevice::CreateMarkOpsForOutsideCompilationPass()); + pm.addPass( + tensorflow::tf2xla::internal::CreateMarkOpsForOutsideCompilationPass()); pm.addPass(tensorflow::tf2xla::internal:: CreateExtractHeadTailOutsideCompilationPass()); - pm.addPass(mlir::TFDevice::CreateExtractOutsideCompilationPass()); + pm.addPass( + tensorflow::tf2xla::internal::CreateExtractOutsideCompilationPass()); } // Outline clusters into cluster functions.
pm.addPass(mlir::TFDevice::CreateClusterOutliningPass()); @@ -234,6 +238,7 @@ void AddNonTPUBridgeClusteringPipelinePasses(OpPassManager& pm) { pm.addNestedPass( tensorflow::tf2xla::internal::CreateVerifyClusteringPass()); } +// LINT.ThenChange(:tpu_bridge_passes) }; // namespace internal }; // namespace tf2xla diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc index 91b80fa485a83f..d3201bffa137a0 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc @@ -28,14 +28,14 @@ TEST(ClusteringBridgePassesTest, AddsBridgePasses) { OpPassManager pass_manager; AddBridgeClusteringPipelinePasses(pass_manager); - EXPECT_EQ(pass_manager.size(), 47); + EXPECT_EQ(pass_manager.size(), 43); } TEST(ClusteringBridgePassesTest, AddsNonTPUBridgePasses) { OpPassManager pass_manager; AddNonTPUBridgeClusteringPipelinePasses(pass_manager); - EXPECT_EQ(pass_manager.size(), 15); + EXPECT_EQ(pass_manager.size(), 20); } }; // namespace internal diff --git a/tensorflow/compiler/mlir/tf2xla/internal/hlo_post_processing/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/hlo_post_processing/BUILD new file mode 100644 index 00000000000000..2c9500af0052ae --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/hlo_post_processing/BUILD @@ -0,0 +1,7 @@ +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/tf2xla/internal:__subpackages__", + ], + licenses = ["notice"], +) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD index 0e25e62b150047..a391e189e6215c 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD @@ -17,9 +17,6 @@ package( cc_library( name = "clustering_passes", - srcs = [ - "verify_clustering_pass.cc", - ], hdrs = [ "clustering_passes.h", ], @@ -27,14 +24,31 @@ cc_library( "clustering_passes.h.inc", ], deps = [ - ":clustering_passes_inc_gen", ":extract_head_tail_outside_compilation", + ":extract_outside_compilation", + ":mark_ops_for_outside_compilation", ":tpu_cluster_formation", + ":verify_clustering_pass", + ":xla_cluster_formation", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "verify_clustering_pass", + srcs = [ + "verify_clustering_pass.cc", + ], + deps = [ + ":clustering_passes_inc_gen", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:attribute_utils", "//tensorflow/compiler/mlir/tensorflow:string_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", + "//tensorflow/compiler/mlir/tf2xla/internal/utils:dialect_detection_utils", "//tensorflow/core:framework", "//tensorflow/core/transforms/toposort:Pass", "@com_google_absl//absl/container:flat_hash_map", @@ -56,7 +70,7 @@ gentbl_cc_library( ( [ "-gen-pass-decls", - "-name=TFXLABridge", + "-name=TFXLABridgeClustering", ], "clustering_passes.h.inc", ), @@ -74,7 +88,6 @@ tf_cc_test( deps = [ ":clustering_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:test_utils", - "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_main", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -131,8 
+144,8 @@ cc_library( ) cc_library( - name = "extract_head_tail_outside_compilation", - srcs = ["extract_head_tail_outside_compilation.cc"], + name = "extract_outside_compilation", + srcs = ["extract_outside_compilation.cc"], textual_hdrs = [ "clustering_passes.h.inc", ], @@ -141,13 +154,16 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:attribute_utils", "//tensorflow/compiler/mlir/tensorflow:device_util", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", "//tensorflow/compiler/mlir/tensorflow:string_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:shape_inference_pass", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", @@ -161,43 +177,144 @@ cc_library( ) cc_library( - name = "dialect_to_executor_passes", - srcs = [ - "dialect_to_executor_passes.h", - ], + name = "extract_head_tail_outside_compilation", + srcs = ["extract_head_tail_outside_compilation.cc"], textual_hdrs = [ - "dialect_to_executor_passes.h.inc", + "clustering_passes.h.inc", ], deps = [ - ":dialect_to_executor_passes_inc_gen", + ":clustering_passes_inc_gen", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:device_util", + "//tensorflow/compiler/mlir/tensorflow:string_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", "//tensorflow/core:framework", - "//tensorflow/core/transforms/toposort:Pass", + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Rewrite", "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "mlir_to_graph_passes", + hdrs = [ + "mlir_to_graph_passes.h", + ], + textual_hdrs = [ + "mlir_to_graph_passes.h.inc", + ], + deps = [ + ":verify_input_dialect_to_executor_pass", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", ], ) gentbl_cc_library( - name = "dialect_to_executor_passes_inc_gen", + name = "mlir_to_graph_passes_inc_gen", compatible_with = get_compatible_with_portable(), tbl_outs = [ ( [ "-gen-pass-decls", - "-name=TFXLABridge", + "-name=TFXLABridgeMlirToGraph", ], - "dialect_to_executor_passes.h.inc", + "mlir_to_graph_passes.h.inc", ), ], tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "dialect_to_executor_passes.td", + td_file = "mlir_to_graph_passes.td", deps = [ "@llvm-project//mlir:PassBaseTdFiles", ], ) + +cc_library( + name = "verify_input_dialect_to_executor_pass", + srcs = [ + "verify_input_dialect_to_executor_pass.cc", + ], + deps = [ + ":mlir_to_graph_passes_inc_gen", + "//tensorflow/compiler/mlir/tf2xla/internal/utils:dialect_detection_utils", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + 
"@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "xla_cluster_formation", + srcs = ["xla_cluster_formation.cc"], + textual_hdrs = [ + "clustering_passes.h.inc", + ], + deps = [ + ":clustering_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:call_graph_util", + "//tensorflow/compiler/mlir/tensorflow:cluster_util", + "//tensorflow/compiler/mlir/tensorflow:string_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:portable_gif_internal", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "mark_ops_for_outside_compilation", + srcs = ["mark_ops_for_outside_compilation.cc"], + textual_hdrs = [ + "clustering_passes.h.inc", + ], + deps = [ + ":clustering_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:string_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_pass_inc_gen", + "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalization_op_config", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Rewrite", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h index 79721a0da640ae..8062ac32b70bb0 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h @@ -37,10 +37,27 @@ CreateTPUClusterFormationPass(bool strict_clusters = false); std::unique_ptr> CreateExtractHeadTailOutsideCompilationPass(); +// Creates a pass that extract outside compilation (Host ops inside cevice +// cluster) ops to a separate parallel_execute region to run on CPU. +std::unique_ptr> +CreateExtractOutsideCompilationPass(); + +// Create a pass that encapsulates StatefulPartitionedCallOp within a cluster. +std::unique_ptr> +CreateXlaClusterFormationPass(); + +// Creates a pass that marks unsupported ops in device cluster for outside +// compilation. 
+std::unique_ptr> +CreateMarkOpsForOutsideCompilationPass(); + #define GEN_PASS_REGISTRATION +#define GEN_PASS_DECL_MARKOPSFOROUTSIDECOMPILATIONPASS #define GEN_PASS_DECL_TPUCLUSTERFORMATIONPASS #define GEN_PASS_DECL_TPUEXTRACTHEADTAILOUTSIDECOMPILATIONPASS +#define GEN_PASS_DECL_TPUEXTRACTOUTSIDECOMPILATIONPASS #define GEN_PASS_DECL_VERIFYCLUSTERINGPASS +#define GEN_PASS_DECL_XLACLUSTERFORMATIONPASS #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" } // namespace internal diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td index 4fc8af15ffa4fc..8dafe11afea4e3 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td @@ -153,5 +153,149 @@ def ExtractHeadTailOutsideCompilationPass : Pass<"tf-extract-head-tail-outside-c let constructor = "tensorflow::tf2xla::internal::CreateExtractHeadTailOutsideCompilationPass()"; } +def ExtractOutsideCompilationPass : Pass<"tf-extract-outside-compilation", "ModuleOp"> { + let summary = "Extracts device outside compilation computation to a separate tf_device.parallel_execute region."; + let description = [{ + This pass extracts a CPU computation cluster with `_xla_outside_compilation` + annotation, which denotes ops that should be run on CPU/host, from a device cluster. + Each outside compilation cluster is moved to + a tf_device.parallel_execute region. The device cluster is also moved to a + tf_device.parallel_execute region. Communication ops between device and host are + added to pass inputs/outputs to/from the outside compiled region. + + For example, the following tf_device.cluster with an op marked for `xla_outside_compilation`: + + ```mlir + func @outside_compilation() -> tensor { + %0 = "tf_device.cluster"() ( { + %1 = "tf.Const"() {_xla_outside_compilation = "0", value = dense<1.0> : tensor} : () -> (tensor) + %2 = "tf.Identity"(%1) {_xla_outside_compilation = "0"} : (tensor) -> (tensor) + %3 = "tf.AddV2"(%1, %2) : (tensor, tensor) -> (tensor) + tf_device.return %3 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor + } + ``` + + will become a tf_device.parallel_execute op with a CPU/host region and + a tf_device.cluster with communication ops to send data to/from device/host: + + ```mlir + func @outside_compilation() -> tensor { + %0 = "tf_device.parallel_execute"() ( { + "tf_device.launch"() ( { + %1 = "tf._XlaCompileMlirPlaceholderProgramKey"() : () -> tensor<3x!tf_type.string> + %2 = "tf._XlaRecvAtHost"(%1) {device_ordinal = 0 : i64, key = "host_compute_channel_0_0_args"} : (tensor<3x!tf_type.string>) -> tensor + %3 = "tf.Identity"(%2) : (tensor) -> tensor + "tf._XlaSendFromHost"(%3, %1) {device_ordinal = 0 : i64, key = "host_compute_channel_0_0_retvals"} : (tensor, tensor<3x!tf_type.string>) -> () + tf_device.return + }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> () + tf_device.return + }, { + %1 = "tf_device.cluster"() ( { + %2 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %3 = "tf._XlaHostComputeMlir"(%2) {recv_key = "host_compute_channel_0_0_retvals", send_key = "host_compute_channel_0_0_args", tpu_core = 0 : i64} : (tensor) -> tensor + %4 = "tf.AddV2"(%2, %3) : (tensor, tensor) -> tensor + tf_device.return %4 : tensor + }) {device_assignment = [], num_cores_per_replica = 1 : i64, topology = ""} : () -> 
tensor + tf_device.return %1 : tensor + }) : () -> tensor + return %0 : tensor + } + ``` + }]; + + let constructor = "tensorflow::tf2xla::internal::CreateExtractOutsideCompilationPass()"; +} + +def XlaClusterFormationPass : Pass<"tf-xla-cluster-formation", "ModuleOp"> { + let summary = "Encapsulate partitioned calls within a Cluster op"; + let description = [{ + This pass clusters `tf.PartitionedCall` and `tf.StatefulPartitionedCall` + with `_xla_compile_device_type` attribute into a `tf_device.cluster`. + Notice this pass will only rewrite the outermost call if there are nested + calls to avoid nested `tf.XlaLaunch` operations from being created later. + + For example, the following code + + ```mlir + func.func @main() -> tensor { + %0 = "tf.StatefulPartitionedCall"() {_xla_compile_device_type = "CPU", f = @stateful_pcall_func} : () -> (tensor) + func.return %0 : tensor + } + + func.func @stateful_pcall_func() -> tensor { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + func.return %0 : tensor + } + ``` + + will be transformed into, + + ```mlir + func.func @main() -> tensor { + %0 = "tf_device.cluster"() ({ + %1 = "tf.StatefulPartitionedCall"() {_xla_compile_device_type = "CPU", f = @stateful_pcall_func} : () -> tensor + tf_device.return %1 : tensor + }) : () -> tensor + func.return %0 : tensor + } + + func.func @stateful_pcall_func() -> tensor { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + func.return %0 : tensor + } + + ``` + }]; + let constructor = "tensorflow::tf2xla::internal::CreateXlaClusterFormationPass()"; + let dependentDialects = ["mlir::tf_device::TensorFlowDeviceDialect"]; +} + +def MarkOpsForOutsideCompilationPass : Pass<"tf-mark-ops-for-outside-compilation", "ModuleOp"> { + let summary = "Marks ops in device cluster for outside compilation if they are unsupported on device."; + + let description = [{ + This pass marks unsupported ops in a device cluster with + `_xla_outside_compilation` attribute so the operations will run on the host + instead of the device. Unsupported ops are ops that can not be code + generated to run on the device for the cluster including: + + 1. String operations on TPUs. + 2. Operations that don't have a kernel defined for the device. + + This pass is conservative in that it will mark all ops for outside compilation + that can not be compiled for the device. Exceptions for this are added for ops + that will be rewritten or decomposed before compiling on device. 
+ + + For example, tf_device.cluster op with an unsupported op, tf.UnsupportedOp: + + ```mlir + func @unsupported_op() -> tensor { + %0 = "tf_device.cluster"() ( { + %1 = "tf.UnsupportedOp"() : () -> tensor + %2 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %2 : tensor + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + return %0 : tensor + } + ``` + + will mark tf.UnsupportedOp with `_xla_outside_compilation` attribute: + + ```mlir + func @unsupported_op() -> tensor { + %0 = "tf_device.cluster"() ( { + %1 = "tf.UnsupportedOp"() {_xla_outside_compilation = "auto0"} : () -> tensor + %2 = "tf.Identity"(%1) : (tensor) -> tensor + tf_device.return %2 : tensor + }) {allow_soft_placement = true, device_assignment = [], num_cores_per_replica = 1 : i64, topology = ""} : () -> tensor + return %0 : tensor + } + ``` + }]; + let constructor = "tensorflow::tf2xla::internal::CreateMarkOpsForOutsideCompilationPass()"; +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc similarity index 86% rename from tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc rename to tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc index ccc72962fd141d..6bc3468a2729e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc @@ -13,16 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include #include #include -#include #include #include #include #include #include +#include "absl/algorithm/container.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -34,33 +34,65 @@ limitations under the License. 
#include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeRange.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" -namespace mlir { -namespace TFDevice { +namespace tensorflow { +namespace tf2xla { +namespace internal { namespace { +using llvm::ArrayRef; +using llvm::SmallVector; +using mlir::Block; +using mlir::BlockArgument; +using mlir::DenseIntElementsAttr; +using mlir::IRMapping; +using mlir::Location; +using mlir::LogicalResult; +using mlir::ModuleOp; +using mlir::OpBuilder; +using mlir::Operation; +using mlir::OperationPass; +using mlir::OpOperand; +using mlir::OpResult; +using mlir::OwningOpRef; +using mlir::RankedTensorType; +using mlir::StringAttr; +using mlir::StringRef; +using mlir::SymbolTable; +using mlir::Type; +using mlir::TypeRange; +using mlir::Value; +using mlir::ValueRange; +using mlir::WalkResult; +using mlir::func::FuncOp; +using mlir::func::ReturnOp; + constexpr char kDeviceAttr[] = "device"; constexpr char kHostFunctionAttr[] = "host_func"; constexpr char kXlaMapOutsideCompilationAttr[] = "_xla_map_outside_compilation"; @@ -68,7 +100,7 @@ constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; constexpr char kNoReplicationCluster[] = "__no_replication_cluster"; #define GEN_PASS_DEF_EXTRACTOUTSIDECOMPILATIONPASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" struct ExtractOutsideCompilation : public impl::ExtractOutsideCompilationPassBase< @@ -79,9 +111,9 @@ struct ExtractOutsideCompilation // Build a function containing `ops` with `inputs` and `outputs` using // `builder`. The `ops` are cloned and modified to use the function arguments // as inputs. 
-func::FuncOp BuildFunction(llvm::ArrayRef ops, - llvm::ArrayRef inputs, - llvm::ArrayRef outputs, OpBuilder* builder) { +FuncOp BuildFunction(llvm::ArrayRef ops, + llvm::ArrayRef inputs, + llvm::ArrayRef outputs, OpBuilder* builder) { llvm::SmallVector operand_types; operand_types.reserve(inputs.size()); for (Value v : inputs) operand_types.emplace_back(v.getType()); @@ -91,8 +123,8 @@ func::FuncOp BuildFunction(llvm::ArrayRef ops, auto func_type = builder->getFunctionType(operand_types, output_types); - func::FuncOp outlined_func = - func::FuncOp::create(ops.front()->getLoc(), kHostFunctionAttr, func_type); + FuncOp outlined_func = + FuncOp::create(ops.front()->getLoc(), kHostFunctionAttr, func_type); // Create function body. Block* outlined_func_block = outlined_func.addEntryBlock(); @@ -111,13 +143,13 @@ func::FuncOp BuildFunction(llvm::ArrayRef ops, results_after_mapping.push_back(mapping.lookupOrDefault(result)); } - builder->create(ops.front()->getLoc(), results_after_mapping); + builder->create(ops.front()->getLoc(), results_after_mapping); return outlined_func; } // Encapsulates `func` in a module and serializes that module. // `serialized_func_module` is set to the serialized module. -void EncapsulateFuncAndSerialize(func::FuncOp func, +void EncapsulateFuncAndSerialize(FuncOp func, std::string* serialized_func_module) { // Create a new module to hold func and all referenced functions. OwningOpRef module_for_func = @@ -175,14 +207,14 @@ Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, llvm::StringRef communication_key) { if (device_ordinal) return ApplyXlaHostTransferAttr( - builder.create( + builder.create( loc, inputs, /*dynamic_key=*/compilation_key, device_ordinal, builder.getStringAttr(communication_key), device_type_attr), builder); return ApplyXlaHostTransferAttr( - builder.create( + builder.create( loc, inputs, /*dynamic_key=*/compilation_key, builder.getStringAttr(communication_key), @@ -200,13 +232,13 @@ Operation* CreateRecvAtHostOp(OpBuilder& builder, Location loc, llvm::StringRef communication_key) { if (device_ordinal) return ApplyXlaHostTransferAttr( - builder.create( + builder.create( loc, output_types, /*dynamic_key=*/compilation_key, device_ordinal, builder.getStringAttr(communication_key), device_type_attr), builder); return ApplyXlaHostTransferAttr( - builder.create( + builder.create( loc, output_types, /*dynamic_key=*/compilation_key, builder.getStringAttr(communication_key), /*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal), @@ -216,10 +248,10 @@ Operation* CreateRecvAtHostOp(OpBuilder& builder, Location loc, // Clones an IfRegionOp 'if_region' and attributes and creates then/else regions // with yield op and an empty block. -TF::IfRegionOp CloneEmptyIfWithPredicate(TF::IfRegionOp if_region, - OpBuilder& builder) { +mlir::TF::IfRegionOp CloneEmptyIfWithPredicate(mlir::TF::IfRegionOp if_region, + OpBuilder& builder) { // Mark op as stateful due to side-effecting communication ops added later. 
- auto host_side_if = builder.create( + auto host_side_if = builder.create( if_region.getLoc(), llvm::SmallVector{}, if_region.getCond(), /*is_stateless=*/false, if_region.get_thenFuncNameAttr(), if_region.get_elseFuncNameAttr()); @@ -228,23 +260,23 @@ TF::IfRegionOp CloneEmptyIfWithPredicate(TF::IfRegionOp if_region, auto& then_branch = host_side_if.getThenBranch(); then_branch.push_back(new Block); builder.setInsertionPointToEnd(&then_branch.front()); - builder.create(if_region.getLoc(), - /*operands=*/ArrayRef{}); + builder.create(if_region.getLoc(), + /*operands=*/ArrayRef{}); // Create empty else branch region. auto& else_branch = host_side_if.getElseBranch(); else_branch.push_back(new Block); builder.setInsertionPointToEnd(&else_branch.front()); - builder.create(if_region.getLoc(), - /*operands=*/ArrayRef{}); + builder.create(if_region.getLoc(), + /*operands=*/ArrayRef{}); return host_side_if; } // Creates a WhileRegionOp cond and body regions with yield op and // an empty body. -TF::WhileRegionOp CloneEmptyWhile(uint64_t parallel_iterations, Location loc, - OpBuilder& builder) { +mlir::TF::WhileRegionOp CloneEmptyWhile(uint64_t parallel_iterations, + Location loc, OpBuilder& builder) { // Mark op as stateful due to side-effecting communication ops added later. - auto host_side_while = builder.create( + auto host_side_while = builder.create( loc, /*output=*/ArrayRef{}, /*input=*/ArrayRef{}, parallel_iterations, /*is_stateless=*/false, /*shape_invariant=*/false); @@ -252,7 +284,7 @@ TF::WhileRegionOp CloneEmptyWhile(uint64_t parallel_iterations, Location loc, auto& body = host_side_while.getBody(); body.push_back(new Block); builder.setInsertionPointToEnd(&body.front()); - builder.create(loc, /*operands=*/ArrayRef{}); + builder.create(loc, /*operands=*/ArrayRef{}); return host_side_while; } @@ -261,16 +293,16 @@ TF::WhileRegionOp CloneEmptyWhile(uint64_t parallel_iterations, Location loc, // _XlaSendFromHost but the _XlaCompileMlir has not yet been created for device // cluster that contains the outside compiled ops. This placeholder should be // replaced by the TPU cluster _XlaCompileMlir in a subsequent pass. -TF::_XlaCompileMlirPlaceholderProgramKeyOp CreateCompilationKeyPlaceholder( - Location loc, OpBuilder& builder) { +mlir::TF::_XlaCompileMlirPlaceholderProgramKeyOp +CreateCompilationKeyPlaceholder(Location loc, OpBuilder& builder) { auto result_type = - RankedTensorType::get({3}, builder.getType()); - return builder.create( + RankedTensorType::get({3}, builder.getType()); + return builder.create( loc, /*program=*/result_type, llvm::ArrayRef{}); } // Creates a `tf_device.launch` to wrap cluster ops. -tf_device::LaunchOp CreateLaunchOpForOutsideCluster( +mlir::tf_device::LaunchOp CreateLaunchOpForOutsideCluster( OpBuilder& builder, Operation* loc_op, llvm::StringRef host_device, llvm::SmallVector& return_value_from_host) { llvm::SmallVector host_result_types; @@ -281,20 +313,21 @@ tf_device::LaunchOp CreateLaunchOpForOutsideCluster( // An empty string placeholder is used for the device as that will be later // populated with the device of the associated Device op. // For TPU case, it is TPUReplicateMetadata op. 
- auto launch_op = builder.create( + auto launch_op = builder.create( loc_op->getLoc(), builder.getStringAttr(host_device), /*result_types=*/host_result_types); launch_op.getBody().push_back(new Block); builder.setInsertionPointToEnd(&launch_op.GetBody()); - builder.create(loc_op->getLoc(), return_value_from_host); + builder.create(loc_op->getLoc(), + return_value_from_host); return launch_op; } // Returns true if `op` has non-static shaped outputs. bool HasDynamicOutputs(Operation* op) { for (Value v : op->getResults()) { - if (TF::CanBeRefined(v.getType())) return true; + if (mlir::TF::CanBeRefined(v.getType())) return true; } return false; } @@ -307,7 +340,7 @@ bool HasDynamicOutputs(const llvm::SmallSetVector& cluster_ops) { if (cluster_ops.count(use.getOwner())) { continue; } - if (TF::CanBeRefined(use.get().getType())) return true; + if (mlir::TF::CanBeRefined(use.get().getType())) return true; } } return false; @@ -317,7 +350,7 @@ bool HasDynamicExternalValues(Operation* op) { return op ->walk([](Operation* walked_op) { for (Value v : walked_op->getOperands()) { - if (TF::CanBeRefined(v.getType())) { + if (mlir::TF::CanBeRefined(v.getType())) { return WalkResult::interrupt(); } } @@ -330,14 +363,14 @@ bool HasDynamicExternalValues(Operation* op) { // communicated from device->host. This is for the case when all operands have a // static shape. llvm::SmallSetVector GetStaticExternalOperands( - tf_device::ClusterOp device_cluster, + mlir::tf_device::ClusterOp device_cluster, const llvm::SmallSetVector& cluster_ops) { llvm::SmallSetVector external_values; for (Operation* op : cluster_ops) { op->walk([&](Operation* walked_op) { - if (llvm::isa( - walked_op)) + if (llvm::isa(walked_op)) return WalkResult::advance(); for (Value v : walked_op->getOperands()) { if (!tensorflow::TypeValidForXLA(v.getType())) continue; @@ -347,8 +380,8 @@ llvm::SmallSetVector GetStaticExternalOperands( !HasOutsideCompilationAncestor(defining_op) && // Ignore operands that have already been received by a previously // created cluster. - !llvm::isa( - defining_op)) { + !llvm::isa(defining_op)) { external_values.insert(v); } continue; @@ -385,7 +418,7 @@ llvm::SmallSetVector GetAllExternalOperands( // Returns a SmallSetVector containing all of the operands that need to be // communicated from device->host. llvm::SmallSetVector GetExternalOperands( - tf_device::ClusterOp device_cluster, + mlir::tf_device::ClusterOp device_cluster, const llvm::SmallSetVector& cluster_ops) { // If there are any dynamic outputs, get all of the operands which are defined // external to `cluster_ops`. @@ -418,9 +451,11 @@ void GetExternalOutputs(const llvm::SmallSetVector& cluster_ops, if (!user_set.insert(user).second) continue; for (Value v : user->getOperands()) { if (tensorflow::TypeValidForXLA(v.getType()) && - v.getDefiningOp() == op && !isa(user)) + v.getDefiningOp() == op && + !llvm::isa(user)) external_outputs.insert(v); - if (v.getDefiningOp() == op && isa(user)) + if (v.getDefiningOp() == op && + llvm::isa(user)) tmp_host_outputs.push_back(v); } } @@ -464,7 +499,7 @@ LogicalResult GetShardShapedType(Operation* context_op, shape.push_back(in_shape[i]); } shard_type = RankedTensorType::Builder(ranked_type).setShape(shape); - return success(); + return mlir::success(); } // Output `sharding`, which is the sharding of `val`. `context_op` is used for @@ -483,7 +518,7 @@ LogicalResult GetShardingOfValue(Operation* context_op, Value val, << "A map_outside_compilation op's input should have an explicit " "sharding. 
There is no _XlaSharding attribute on the input op."; sharding = sharding_attr.str(); - return success(); + return mlir::success(); } // Create an `_XlaHostComputeMlir` for the map_outside_compilation case. Inputs @@ -508,7 +543,7 @@ LogicalResult CreateHostComputeMap( Type shard_type; if (failed(GetShardShapedType(original_op, num_cores_per_replica, output.getType(), shard_type))) - return failure(); + return mlir::failure(); shard_output_types.push_back(shard_type); full_output_types.push_back(output.getType()); } @@ -522,10 +557,10 @@ LogicalResult CreateHostComputeMap( Type shard_type; if (failed(GetShardShapedType(original_op, num_cores_per_replica, in.getType(), shard_type))) - return failure(); + return mlir::failure(); std::string in_sharding; if (failed(GetShardingOfValue(original_op, in, in_sharding))) - return failure(); + return mlir::failure(); if (common_split_sharding.empty()) { common_split_sharding = std::move(in_sharding); } else { @@ -534,14 +569,14 @@ LogicalResult CreateHostComputeMap( << "All inputs and outputs of map_outside_compilation should " "have the same sharding."; } - auto in_manual = builder.create( + auto in_manual = builder.create( loc, shard_type, in, common_split_sharding, /*dim=*/-1, /*unspecified_dims=*/builder.getI64ArrayAttr({})); manual_inputs.push_back(in_manual); } // Create the _XlaHostComputeMlirOp - auto host_compute = builder.create( + auto host_compute = builder.create( loc, shard_output_types, manual_inputs, /*send_key=*/builder.getStringAttr(args_communication_key), /*recv_key=*/builder.getStringAttr(retvals_communication_key), @@ -556,7 +591,7 @@ LogicalResult CreateHostComputeMap( if (!full_type_ranked) return original_op->emitOpError() << "map_outside_compilation must have ranked outputs"; - auto out_full = builder.create( + auto out_full = builder.create( loc, full_type, out, common_split_sharding, full_type_ranked.getShape(), /*dim=*/-1, /*unspecified_dims=*/builder.getI64ArrayAttr({})); @@ -564,7 +599,7 @@ LogicalResult CreateHostComputeMap( full_outputs.push_back(out_full); } - return success(); + return mlir::success(); } // Create the _XlaHostComputeMlir with `inputs` and `outputs` for the ordinary @@ -581,7 +616,7 @@ void CreateHostComputeNotMap(OpBuilder& builder, Location loc, llvm::SmallVector device_output_types; for (const auto& output : outputs) device_output_types.push_back(output.getType()); - auto host_compute = builder.create( + auto host_compute = builder.create( loc, device_output_types, inputs, builder.getStringAttr(args_communication_key), builder.getStringAttr(retvals_communication_key), @@ -612,7 +647,7 @@ LogicalResult CreateHostCompute( args_communication_key, retvals_communication_key, serialized_func_module, full_outputs, host_compute_out_ops); - return success(); + return mlir::success(); } } @@ -630,17 +665,18 @@ bool ShouldCloseCluster(llvm::ArrayRef outputs) { bool has_dynamic_output = false; bool has_nonxla_output = false; for (Value v : outputs) { - if (TF::CanBeRefined(v.getType())) { + if (mlir::TF::CanBeRefined(v.getType())) { has_dynamic_output = true; for (Operation* user : v.getUsers()) { if (!HasOutsideCompilationAncestor(user) && - !isa(user)) + !llvm::isa(user)) return true; } } if (!tensorflow::TypeValidForXLA(v.getType())) for (const Operation* user : v.getUsers()) - if (!isa(user)) has_nonxla_output = true; + if (!llvm::isa(user)) + has_nonxla_output = true; } return !has_nonxla_output && !has_dynamic_output; @@ -656,7 +692,7 @@ void ReplaceExternalOperandUsage(ArrayRef external_operands, 
Operation* insertion_point, Block* original_op_block) { auto replace_operand_usage = [&](OpOperand& operand) { - if (TF::CanBeRefined(operand.get().getType()) || + if (mlir::TF::CanBeRefined(operand.get().getType()) || HasDynamicOutputs(operand.getOwner())) { return insertion_point->getParentRegion()->isAncestor( operand.getOwner()->getParentRegion()); @@ -675,7 +711,7 @@ void ReplaceExternalOperandUsage(ArrayRef external_operands, bool HasDynamicOutputs(llvm::ArrayRef outputs) { for (Value v : outputs) { - if (TF::CanBeRefined(v.getType())) { + if (mlir::TF::CanBeRefined(v.getType())) { return true; } } @@ -723,7 +759,7 @@ std::pair MakeCommunicationKeys( // Use a unique name when sending just the IfRegion predicate. This is // for readable and to match the key in the TF2XLA bridge. - if (clustered_ops.size() == 1 && llvm::isa(op) && + if (clustered_ops.size() == 1 && llvm::isa(op) && external_operands.size() == 1) { args_communication_key = llvm::formatv("if_predicate_channel_{0}", (communication_key_index)) @@ -786,20 +822,21 @@ void CloneFirstHost(llvm::SmallVector& core_to_mapping, builder.setInsertionPoint(core_to_host_insertion_point[core]); Operation* clone = builder.clone(*op, core_to_mapping[core]); core_to_mapping[core].map(op, clone); - if (auto recv_at_host = llvm::dyn_cast(clone)) { + if (auto recv_at_host = + llvm::dyn_cast(clone)) { recv_at_host.setDeviceOrdinal(core); clone->setOperand(0, core_to_compilation_key[core]); } else if (auto send_from_host = - llvm::dyn_cast(clone)) { + llvm::dyn_cast(clone)) { send_from_host.setDeviceOrdinal(core); clone->setOperand(1, core_to_compilation_key[core]); } else if (auto recv_at_host = - llvm::dyn_cast(clone)) { + llvm::dyn_cast(clone)) { recv_at_host.setOperand(0, core_to_compilation_key[core]); builder.setInsertionPoint(recv_at_host); recv_at_host.setOperand(1, core_to_device_ordinal[core]); } else if (auto send_from_host = - llvm::dyn_cast(clone)) { + llvm::dyn_cast(clone)) { send_from_host.setOperand(1, core_to_compilation_key[core]); builder.setInsertionPoint(send_from_host); send_from_host.setOperand(2, core_to_device_ordinal[core]); @@ -830,8 +867,8 @@ LogicalResult MoveToHostSingleCluster( std::string serialized_func_module; if (HasDynamicOutputs(external_outputs)) { - func::FuncOp shape_op = BuildFunction(clustered_ops, external_operands, - external_outputs, &builder); + FuncOp shape_op = BuildFunction(clustered_ops, external_operands, + external_outputs, &builder); EncapsulateFuncAndSerialize(shape_op, &serialized_func_module); } @@ -843,7 +880,7 @@ LogicalResult MoveToHostSingleCluster( args_communication_key, retvals_communication_key, serialized_func_module, is_map_oc, num_cores_per_replica, common_split_sharding, host_compute_outputs, host_compute_out_ops))) - return failure(); + return mlir::failure(); // Insert ops on the host side computation to receive data from device. // host0_ops are the ops that will make up the first host process. 
In the @@ -881,7 +918,7 @@ LogicalResult MoveToHostSingleCluster( ++communication_key_index; } - return success(); + return mlir::success(); } // Update is_map_oc the true if op has attribute _xla_map_outside_compilation @@ -903,7 +940,7 @@ LogicalResult UpdateIsMapOutsideCompilation(Operation& op, bool control_above, return op.emitOpError() << "map_outside_compilation inside control flow " "is not implemented."; } - return success(); + return mlir::success(); } // Move outside compiled ops in `src` to `insertion_point` in host @@ -920,7 +957,7 @@ LogicalResult UpdateIsMapOutsideCompilation(Operation& op, bool control_above, // program. Currently only map_outside_compilation-only or ordinary // outside_compilation only is supported. LogicalResult MoveToHostMultiCluster( - tf_device::ClusterOp device_cluster, Block* src, + mlir::tf_device::ClusterOp device_cluster, Block* src, ArrayRef core_to_host_insertion_point, ArrayRef core_to_compilation_key, ArrayRef core_to_device_ordinal, int default_device_ordinal, @@ -938,8 +975,8 @@ LogicalResult MoveToHostMultiCluster( // single op except in the case where some of the input/output shapes are // non-static. llvm::SmallSetVector clustered_ops; - auto device_type_attr = - device_cluster->getAttrOfType(TF::kCompileDeviceTypeAttr); + auto device_type_attr = device_cluster->getAttrOfType( + mlir::TF::kCompileDeviceTypeAttr); for (Operation& op : llvm::make_early_inc_range(*src)) { if (HasOutsideCompilationAncestorExclusive(&op) || @@ -947,7 +984,7 @@ LogicalResult MoveToHostMultiCluster( continue; if (failed(UpdateIsMapOutsideCompilation(op, control_above, is_map_oc))) - return failure(); + return mlir::failure(); llvm::SmallSetVector external_outputs; llvm::SmallVector host_outputs; @@ -971,7 +1008,7 @@ LogicalResult MoveToHostMultiCluster( core_to_device_ordinal, default_device_ordinal, device_type_attr, *is_map_oc, num_cores_per_replica, common_split_sharding, communication_key_index))) - return failure(); + return mlir::failure(); clustered_ops.clear(); } @@ -999,18 +1036,18 @@ LogicalResult MoveToHostMultiCluster( core_to_device_ordinal, default_device_ordinal, device_type_attr, *is_map_oc, num_cores_per_replica, common_split_sharding, communication_key_index))) - return failure(); + return mlir::failure(); clustered_ops.clear(); } } - return success(); + return mlir::success(); } void GetReturnValueFromDevice( - tf_device::ClusterOp device_cluster, + mlir::tf_device::ClusterOp device_cluster, const llvm::SmallVector& return_value_from_host, llvm::SmallVector& return_value_from_device) { - if (auto return_op = llvm::dyn_cast_or_null( + if (auto return_op = llvm::dyn_cast_or_null( device_cluster.GetBody().getTerminator())) { for (auto v : return_op.getOperands()) { if (absl::c_count(return_value_from_host, v) == 0) { @@ -1028,14 +1065,14 @@ void GetReturnValueFromDevice( // launch in tf_device.parallel_execute. Uses `compilation_key, // `device_ordinal` and `communication_key_index` when creating communication // ops. 
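Aside on the communication keys referenced above: each device/host transfer is named from a running `communication_key_index` via `llvm::formatv`, with the single-operand IfRegion predicate getting the dedicated readable name shown in the hunk above. A small sketch of that naming (the generic `_args`/`_retvals` scheme here is an illustrative assumption; only the predicate prefix is taken from the diff):

```cpp
#include <string>
#include <utility>
#include "llvm/Support/FormatVariadic.h"

// Returns {args_key, retvals_key} for the `index`-th host/device transfer.
std::pair<std::string, std::string> MakeChannelKeys(int index,
                                                    bool is_if_predicate) {
  if (is_if_predicate) {
    // A lone IfRegion predicate gets a dedicated, readable channel name.
    std::string key = llvm::formatv("if_predicate_channel_{0}", index).str();
    return {key, key};
  }
  return {llvm::formatv("host_compute_channel_{0}_args", index).str(),
          llvm::formatv("host_compute_channel_{0}_retvals", index).str()};
}
```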
-LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, +LogicalResult DecomposeControlFlow(mlir::tf_device::ClusterOp device_cluster, ArrayRef core_to_compilation_key, ArrayRef core_to_device_ordinal, int default_device_ordinal, int& communication_key_index, std::optional& is_map_oc) { auto result = device_cluster.GetBody().walk([&](Operation* op) { - if (auto if_op = llvm::dyn_cast(op)) { + if (auto if_op = llvm::dyn_cast(op)) { if (!HasOutsideCompilationNested(op)) return WalkResult::advance(); OpBuilder builder(if_op); auto host_if = CloneEmptyIfWithPredicate(if_op, builder); @@ -1057,7 +1094,7 @@ LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, if_op->setAttr("is_stateless", builder.getBoolAttr(false)); MarkOutsideCompiled(host_if.getOperation()); } - if (auto while_op = llvm::dyn_cast(op)) { + if (auto while_op = llvm::dyn_cast(op)) { if (!HasOutsideCompilationNested(op)) return WalkResult::advance(); OpBuilder builder(while_op); auto host_while = CloneEmptyWhile(while_op.getParallelIterations(), @@ -1071,8 +1108,8 @@ LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, auto condition = while_op.getCond().front().getTerminator()->getOperand(0); builder.setInsertionPoint(while_op.getCond().front().getTerminator()); - builder.create(while_op.getLoc(), condition, - condition_send_recv_key); + builder.create(while_op.getLoc(), condition, + condition_send_recv_key); // device_ordinal0 is the ordinal of TPU_REPLICATED_CORE_0 and is only // used in the replicated case. Value device_ordinal0 = nullptr; @@ -1082,10 +1119,11 @@ LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, auto recv_condition_at_host = CreateRecvAtHostOp( builder, while_op.getLoc(), TypeRange{condition.getType()}, core_to_compilation_key[0], device_ordinal0, default_device_ordinal, - device_cluster->getAttrOfType(TF::kCompileDeviceTypeAttr), + device_cluster->getAttrOfType( + mlir::TF::kCompileDeviceTypeAttr), condition_send_recv_key); - builder.create(while_op.getLoc(), - recv_condition_at_host->getResults()); + builder.create(while_op.getLoc(), + recv_condition_at_host->getResults()); if (failed(MoveToHostMultiCluster( device_cluster, &while_op.getCond().front(), @@ -1106,14 +1144,14 @@ LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, } return WalkResult::advance(); }); - if (result.wasInterrupted()) return failure(); - return success(); + if (result.wasInterrupted()) return mlir::failure(); + return mlir::success(); } // Removes outside compilation from all ops inside `host_launch_op`. Should // only be run after all outside compiled ops have been moved to // `host_launch_op`. -void RemoveOutsideCompilation(tf_device::LaunchOp host_launch_op) { +void RemoveOutsideCompilation(mlir::tf_device::LaunchOp host_launch_op) { host_launch_op.GetBody().walk([&](Operation* op) { if (op->hasAttr(kXlaOutsideCompilationAttr)) { op->removeAttr( @@ -1129,14 +1167,16 @@ void RemoveOutsideCompilation(tf_device::LaunchOp host_launch_op) { // if it is non replicated cluster and there is a device attr with some // non-empty device, then that device's ordinal (0 out of TPU:0 and // 1 out of TPU:1) is extracted and the default ordinal is set to this value. 
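The comment above describes deriving the default ordinal from a device string such as `TPU:0` or `TPU:1`. As a standalone illustration of that idea (plain C++, not the pass's implementation), the ordinal is just the numeric suffix after the last colon:

```cpp
// Sketch: pull the trailing ordinal out of a device string, e.g.
// "/job:worker/replica:0/task:0/device:TPU:1" -> 1.
#include <optional>
#include <string>

std::optional<int> ExtractDeviceOrdinal(const std::string& device) {
  auto pos = device.rfind(':');
  if (pos == std::string::npos || pos + 1 >= device.size()) return std::nullopt;
  int ordinal = 0;
  for (size_t i = pos + 1; i < device.size(); ++i) {
    char c = device[i];
    if (c < '0' || c > '9') return std::nullopt;  // non-numeric suffix
    ordinal = ordinal * 10 + (c - '0');
  }
  return ordinal;
}
```

For example, `ExtractDeviceOrdinal("/device:TPU:1")` yields 1, matching the "1 out of TPU:1" behavior described above.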
-LogicalResult GetDefaultDeviceOrdinal(tf_device::ClusterOp device_cluster, +LogicalResult GetDefaultDeviceOrdinal(mlir::tf_device::ClusterOp device_cluster, int& default_ordinal) { - bool has_replication = device_cluster->hasAttr(TF::kReplicationInfoAttr); + bool has_replication = + device_cluster->hasAttr(mlir::TF::kReplicationInfoAttr); std::string replication_info; if (has_replication) { replication_info = - device_cluster->getAttrOfType(TF::kReplicationInfoAttr) + device_cluster + ->getAttrOfType(mlir::TF::kReplicationInfoAttr) .str(); } if (replication_info == kNoReplicationCluster || replication_info.empty()) { @@ -1156,7 +1196,7 @@ LogicalResult GetDefaultDeviceOrdinal(tf_device::ClusterOp device_cluster, << " could not find ordinal for the given device"; } } - return success(); + return mlir::success(); } // The results of parallel executes is the combination of return values from @@ -1177,7 +1217,7 @@ llvm::SmallVector GetParallelExecuteResultsTypes( // Remap the device cluster results with parallel execute op results llvm::SmallVector GetRemappedTpuClusterResults( - tf_device::ClusterOp device_cluster, + mlir::tf_device::ClusterOp device_cluster, const llvm::SmallVector& return_value_from_host, const llvm::SmallVector& return_value_from_device) { llvm::SmallVector remapped_device_cluster_results; @@ -1187,7 +1227,7 @@ llvm::SmallVector GetRemappedTpuClusterResults( return_value_from_host.size() + return_value_from_device.size()); llvm::SmallDenseMap> return_operand_map; - auto return_op = llvm::dyn_cast( + auto return_op = llvm::dyn_cast( device_cluster.GetBody().getTerminator()); for (OpOperand& operand : return_op->getOpOperands()) { @@ -1221,8 +1261,8 @@ llvm::SmallVector GetRemappedTpuClusterResults( // Remap cluster results with parallel_execute results if user is outside of // parallel_execute. void RemapDeviceClusterResultsWithParallelExecuteResults( - tf_device::ClusterOp device_cluster, - tf_device::ParallelExecuteOp parallel_execute_op, + mlir::tf_device::ClusterOp device_cluster, + mlir::tf_device::ParallelExecuteOp parallel_execute_op, const llvm::SmallVector& return_value_from_host, const llvm::SmallVector& return_value_from_device) { llvm::SmallVector remapped_device_cluster_results = @@ -1261,7 +1301,7 @@ llvm::SmallVector GetNewDeviceTypes( } // Move ops in old device cluster to new device cluster -void MoveOldTpuClusterToNewTpuCluster(tf_device::ClusterOp device_cluster, +void MoveOldTpuClusterToNewTpuCluster(mlir::tf_device::ClusterOp device_cluster, Operation* after_op_r) { for (Operation& op : llvm::make_early_inc_range(device_cluster.GetBody())) { if (&op != device_cluster.GetBody().getTerminator()) { @@ -1271,7 +1311,7 @@ void MoveOldTpuClusterToNewTpuCluster(tf_device::ClusterOp device_cluster, } // Move ops in the tmp host launch op to new host launch op -void MoveTmpLaunchOpToNewLaunchOp(tf_device::LaunchOp tmp_host_launch_op, +void MoveTmpLaunchOpToNewLaunchOp(mlir::tf_device::LaunchOp tmp_host_launch_op, Operation* after_op_host_cluster) { for (Operation& op : llvm::make_early_inc_range(tmp_host_launch_op.GetBody())) { @@ -1285,10 +1325,10 @@ void MoveTmpLaunchOpToNewLaunchOp(tf_device::LaunchOp tmp_host_launch_op, // outside compiled ops, we can create the actual parallel_execute regions. // Still, one region is for the host computation for outside compilation and // the other one is for the original Device cluster computation. 
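As noted in the hunk above, the results of the final `tf_device.parallel_execute` are the combination of the host and device return values. A minimal sketch of assembling its result type list (assuming host results precede the device-cluster results, which matches the device block being the last region):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"

// Concatenate host-launch result types with device-cluster result types.
llvm::SmallVector<mlir::Type, 4> CombineParallelExecuteResultTypes(
    llvm::ArrayRef<mlir::Value> returns_from_host,
    llvm::ArrayRef<mlir::Value> returns_from_device) {
  llvm::SmallVector<mlir::Type, 4> types;
  types.reserve(returns_from_host.size() + returns_from_device.size());
  for (mlir::Value v : returns_from_host) types.push_back(v.getType());
  for (mlir::Value v : returns_from_device) types.push_back(v.getType());
  return types;
}
```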
-tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( +mlir::tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( OpBuilder& builder, int num_regions, ArrayRef core_to_host, - tf_device::ClusterOp device_cluster, - ArrayRef core_to_tmp_host_launch, + mlir::tf_device::ClusterOp device_cluster, + ArrayRef core_to_tmp_host_launch, ArrayRef return_value_from_host, ArrayRef return_value_from_device) { llvm::SmallVector parallel_execute_result_types = @@ -1296,9 +1336,9 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( return_value_from_device); builder.setInsertionPoint(device_cluster); - auto parallel_execute_op = builder.create( + auto parallel_execute_op = builder.create( device_cluster.getLoc(), num_regions, parallel_execute_result_types); - SmallVector core_to_host_launch; + SmallVector core_to_host_launch; for (int core = 0; core < core_to_tmp_host_launch.size(); ++core) { Block& host_computation_block = parallel_execute_op.GetRegionBlockWithIndex(core); @@ -1313,14 +1353,14 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( llvm::SmallVector host_results; host_results.insert(host_results.end(), return_value_from_host.begin(), return_value_from_host.end()); - tf_device::LaunchOp host_launch_op = CreateLaunchOpForOutsideCluster( + mlir::tf_device::LaunchOp host_launch_op = CreateLaunchOpForOutsideCluster( builder, device_cluster, core_to_host[core], host_results); core_to_host_launch.push_back(host_launch_op); // Create a return op for host computation block builder.setInsertionPointToEnd(&host_computation_block); - builder.create(device_cluster.getLoc(), - host_launch_op->getResults()); + builder.create(device_cluster.getLoc(), + host_launch_op->getResults()); } // Move the launch body to last parallel_execute block. @@ -1337,7 +1377,7 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( // Create a empty device cluster op with same attribute but different return // type - auto new_device_cluster = builder.create( + auto new_device_cluster = builder.create( device_cluster.getLoc(), device_result_types, /*operands=*/llvm::ArrayRef{}, device_cluster->getAttrs()); @@ -1345,14 +1385,14 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( builder.setInsertionPointToEnd(&new_device_cluster.GetBody()); // Create return op for device computation region in the paralle_execute op - Operation* after_op_r = builder.create( + Operation* after_op_r = builder.create( new_device_cluster.getLoc(), device_results); builder.setInsertionPointToEnd(¶llel_execute_device_block); // Create return op for the new device cluster op - builder.create(device_cluster.getLoc(), - new_device_cluster.getResults()); + builder.create(device_cluster.getLoc(), + new_device_cluster.getResults()); MoveOldTpuClusterToNewTpuCluster(device_cluster, after_op_r); @@ -1371,8 +1411,8 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( // a region for `device_cluster` computation by extracting outside compiled ops // to host computation. LogicalResult CreateParallelExecuteForOutsideCompilation( - tf_device::ClusterOp device_cluster, - llvm::SmallVector& ops, + mlir::tf_device::ClusterOp device_cluster, + llvm::SmallVector& ops, std::optional& is_map_oc, ArrayRef core_to_host, bool has_tpu_device) { OpBuilder builder(device_cluster); @@ -1385,10 +1425,11 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( // `map_outside_compilation` case `num_host_regions == num_cores_per_replica`. 
const int num_host_regions = core_to_host.size(); const int num_regions = 1 + num_host_regions; - auto tmp_parallel_execute_op = builder.create( - device_cluster.getLoc(), num_regions, llvm::ArrayRef{}); + auto tmp_parallel_execute_op = + builder.create( + device_cluster.getLoc(), num_regions, llvm::ArrayRef{}); SmallVector core_to_host_insertion_point; - SmallVector core_to_tmp_launch; + SmallVector core_to_tmp_launch; SmallVector compilation_key_ops; SmallVector core_to_compilation_key; SmallVector core_to_device_ordinal_op; @@ -1399,13 +1440,14 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( builder.setInsertionPointToEnd(&tmp_host_computation_block); // Create a single tmp launch op for all outside compiled ops. llvm::SmallVector tmp_host_results; - tf_device::LaunchOp tmp_host_launch_op = CreateLaunchOpForOutsideCluster( - builder, device_cluster, core_to_host[core], tmp_host_results); + mlir::tf_device::LaunchOp tmp_host_launch_op = + CreateLaunchOpForOutsideCluster(builder, device_cluster, + core_to_host[core], tmp_host_results); core_to_tmp_launch.push_back(tmp_host_launch_op); // Create a tmp return op for tmp host computation block builder.setInsertionPointToEnd(&tmp_host_computation_block); - builder.create(device_cluster.getLoc(), - llvm::ArrayRef{}); + builder.create(device_cluster.getLoc(), + llvm::ArrayRef{}); core_to_host_insertion_point.push_back( tmp_host_launch_op.GetBody().getTerminator()); @@ -1418,16 +1460,17 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( compilation_key_op = CreateCompilationKeyPlaceholder(device_cluster.getLoc(), builder); compilation_key = - llvm::dyn_cast( + llvm::dyn_cast( compilation_key_op) .getProgram(); if (has_tpu_device) { - device_ordinal_op = builder.create( - device_cluster.getLoc(), - RankedTensorType::get({}, builder.getI64Type()), - builder.getI64IntegerAttr(core)); + device_ordinal_op = + builder.create( + device_cluster.getLoc(), + RankedTensorType::get({}, builder.getI64Type()), + builder.getI64IntegerAttr(core)); } else { - device_ordinal_op = builder.create( + device_ordinal_op = builder.create( device_cluster.getLoc(), DenseIntElementsAttr::get( RankedTensorType::get({}, builder.getI64Type()), @@ -1436,7 +1479,7 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( compilation_key_ops.push_back(compilation_key_op); core_to_compilation_key.push_back(compilation_key); core_to_device_ordinal_op.push_back(device_ordinal_op); - if (device_cluster->getParentOfType()) + if (device_cluster->getParentOfType()) core_to_device_ordinal.push_back( core_to_device_ordinal_op[core]->getResults()[0]); } @@ -1444,7 +1487,7 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( builder.setInsertionPoint(tmp_parallel_execute_op); int default_device_ordinal = 0; if (failed(GetDefaultDeviceOrdinal(device_cluster, default_device_ordinal))) { - return failure(); + return mlir::failure(); } // communication_key_index is part of the message identifier and is // incremented for each _XlaHostComputeMlir. @@ -1455,7 +1498,7 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( if (failed(DecomposeControlFlow( device_cluster, core_to_compilation_key, core_to_device_ordinal, default_device_ordinal, communication_key_index, is_map_oc))) - return failure(); + return mlir::failure(); // Move all outside compiled ops including control flow to tmp host launch. // Also set the values returned from the host when ops are moved. 
@@ -1465,7 +1508,7 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( core_to_device_ordinal, default_device_ordinal, /*control_above=*/false, is_map_oc, communication_key_index, &returns_from_host))) - return failure(); + return mlir::failure(); llvm::SmallVector returns_from_device; GetReturnValueFromDevice(device_cluster, returns_from_host, @@ -1477,10 +1520,10 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( if (communication_key_index == 0 || core_to_device_ordinal.empty()) for (auto op : core_to_device_ordinal_op) op->erase(); - for (tf_device::LaunchOp tmp_host_launch_op : core_to_tmp_launch) + for (mlir::tf_device::LaunchOp tmp_host_launch_op : core_to_tmp_launch) RemoveOutsideCompilation(tmp_host_launch_op); - tf_device::ParallelExecuteOp parallel_execute_op = + mlir::tf_device::ParallelExecuteOp parallel_execute_op = CreateFinalParallelExecuteOp(builder, num_regions, core_to_host, device_cluster, core_to_tmp_launch, returns_from_host, returns_from_device); @@ -1494,12 +1537,12 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( device_cluster.erase(); - return success(); + return mlir::success(); } // Check that cluster results are valid. An result is invalid when it does not // have a valid XLA type. -LogicalResult CheckClusterResults(tf_device::ClusterOp cluster) { +LogicalResult CheckClusterResults(mlir::tf_device::ClusterOp cluster) { for (OpResult result : cluster.getResults()) { if (!tensorflow::TypeValidForXLA(result.getType())) { return cluster.emitError() @@ -1508,14 +1551,14 @@ LogicalResult CheckClusterResults(tf_device::ClusterOp cluster) { << result.getType(); } } - return success(); + return mlir::success(); } // Check that op marked for outside compilation has an ancestor also marked for // outside compilation. LogicalResult CheckAncestorNotOutsideComp(Operation* op) { if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) - return success(); + return mlir::success(); Operation* iter_op = op; while (auto* parent_op = iter_op->getParentOp()) { if (parent_op->getAttrOfType(kXlaOutsideCompilationAttr)) { @@ -1526,7 +1569,7 @@ LogicalResult CheckAncestorNotOutsideComp(Operation* op) { } iter_op = parent_op; } - return success(); + return mlir::success(); } // Check the validity of the module, pre-pass. @@ -1535,18 +1578,18 @@ LogicalResult CheckPreconditions(ModuleOp module) { if (failed(CheckAncestorNotOutsideComp(op))) return WalkResult::interrupt(); return WalkResult::advance(); }); - if (walk_result.wasInterrupted()) return failure(); - return success(); + if (walk_result.wasInterrupted()) return mlir::failure(); + return mlir::success(); } // Check the validity of the module, post-pass. 
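Both the pre- and post-condition checks below follow the same walk-and-interrupt validation idiom: walk the module, emit an error and interrupt on the first violation, then convert the interrupt into a failure. A generic sketch of that idiom (the attribute check here is only an example):

```cpp
#include "llvm/ADT/StringRef.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Support/LogicalResult.h"

// Fails if any op in `module` carries the attribute `attr_name`.
mlir::LogicalResult CheckNoUnexpectedAttr(mlir::ModuleOp module,
                                          llvm::StringRef attr_name) {
  auto walk_result = module.walk([&](mlir::Operation* op) {
    if (op->hasAttr(attr_name)) {
      op->emitOpError() << "unexpected attribute '" << attr_name << "'";
      return mlir::WalkResult::interrupt();
    }
    return mlir::WalkResult::advance();
  });
  return walk_result.wasInterrupted() ? mlir::failure() : mlir::success();
}
```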
LogicalResult CheckPostconditions(ModuleOp module) { - auto walk_result = module.walk([&](tf_device::ClusterOp cluster) { + auto walk_result = module.walk([&](mlir::tf_device::ClusterOp cluster) { if (failed(CheckClusterResults(cluster))) return WalkResult::interrupt(); return WalkResult::advance(); }); - if (walk_result.wasInterrupted()) return failure(); - return success(); + if (walk_result.wasInterrupted()) return mlir::failure(); + return mlir::success(); } void ExtractOutsideCompilation::runOnOperation() { @@ -1558,10 +1601,11 @@ void ExtractOutsideCompilation::runOnOperation() { if (failed(tensorflow::GetDevicesFromOp(module, &devices))) return signalPassFailure(); - llvm::SmallVector tmp_parallel_execute_ops; + llvm::SmallVector + tmp_parallel_execute_ops; std::optional is_map_oc; - module.walk([&](tf_device::ClusterOp device_cluster) { + module.walk([&](mlir::tf_device::ClusterOp device_cluster) { if (HasOutsideCompilationNested(device_cluster.getOperation())) { SmallVector core_to_host; if (failed(tensorflow::GetDeviceToHostMap(device_cluster, core_to_host))) @@ -1594,5 +1638,6 @@ std::unique_ptr> CreateExtractOutsideCompilationPass() { return std::make_unique(); } -} // namespace TFDevice -} // namespace mlir +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc similarity index 70% rename from tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc rename to tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc index ca68e36e581443..6a38f620377cf2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,30 +19,61 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" +#include "absl/strings/str_join.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Rewrite/PatternApplicator.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/core/lib/monitoring/gauge.h" -namespace mlir { -namespace TFDevice { +namespace tensorflow { +namespace tf2xla { +namespace internal { namespace { +using mlir::Block; +using mlir::BoolAttr; +using mlir::Dialect; +using mlir::LogicalResult; +using mlir::MLIRContext; +using mlir::ModuleOp; +using mlir::Operation; +using mlir::OperationName; +using mlir::OperationPass; +using mlir::Pattern; +using mlir::PatternApplicator; +using mlir::RewritePatternSet; +using mlir::StringAttr; +using mlir::TensorType; +using mlir::Type; +using mlir::Value; +using mlir::WalkResult; + constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; constexpr char kAllowSoftPlacementAttr[] = "allow_soft_placement"; @@ -52,7 +83,7 @@ auto* auto_outside_compilation_gauge = "Tracks if auto outside compilation is enabled"); #define GEN_PASS_DEF_MARKOPSFOROUTSIDECOMPILATIONPASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" struct MarkOpsForOutsideCompilation : public impl::MarkOpsForOutsideCompilationPassBase< @@ -79,16 +110,17 @@ void AddCanonicalizationPatterns(MLIRContext* context, void AddSupportedOpsUsingFolding(MLIRContext* context, llvm::DenseSet* supported_ops) { llvm::SmallDenseSet allowlist_ops = { - OperationName(TF::BroadcastArgsOp::getOperationName(), context), - OperationName(TF::BroadcastGradientArgsOp::getOperationName(), context), - OperationName(TF::ConcatOffsetOp::getOperationName(), context), - OperationName(TF::EmptyOp::getOperationName(), context), - OperationName(TF::ListDiffOp::getOperationName(), context), - OperationName(TF::RankOp::getOperationName(), context), - OperationName(TF::RangeOp::getOperationName(), context), - OperationName(TF::ShapeOp::getOperationName(), 
context), - OperationName(TF::ShapeNOp::getOperationName(), context), - OperationName(TF::SizeOp::getOperationName(), context), + OperationName(mlir::TF::BroadcastArgsOp::getOperationName(), context), + OperationName(mlir::TF::BroadcastGradientArgsOp::getOperationName(), + context), + OperationName(mlir::TF::ConcatOffsetOp::getOperationName(), context), + OperationName(mlir::TF::EmptyOp::getOperationName(), context), + OperationName(mlir::TF::ListDiffOp::getOperationName(), context), + OperationName(mlir::TF::RankOp::getOperationName(), context), + OperationName(mlir::TF::RangeOp::getOperationName(), context), + OperationName(mlir::TF::ShapeOp::getOperationName(), context), + OperationName(mlir::TF::ShapeNOp::getOperationName(), context), + OperationName(mlir::TF::SizeOp::getOperationName(), context), }; supported_ops->insert(allowlist_ops.begin(), allowlist_ops.end()); @@ -102,14 +134,16 @@ void AddSupportedOpsUsingFolding(MLIRContext* context, void AddOldBridgeOnlyOps(MLIRContext* context, llvm::DenseSet* supported_ops) { llvm::SmallDenseSet allowlist_ops = { - OperationName(TF::DynamicPartitionOp::getOperationName(), context), - OperationName(TF::OutfeedEnqueueOp::getOperationName(), context), - OperationName(TF::WhereOp::getOperationName(), context), - OperationName(TF::UniqueOp::getOperationName(), context), - OperationName(TF::XlaSetDynamicDimensionSizeOp::getOperationName(), + OperationName(mlir::TF::DynamicPartitionOp::getOperationName(), context), + OperationName(mlir::TF::OutfeedEnqueueOp::getOperationName(), context), + OperationName(mlir::TF::WhereOp::getOperationName(), context), + OperationName(mlir::TF::UniqueOp::getOperationName(), context), + OperationName(mlir::TF::XlaSetDynamicDimensionSizeOp::getOperationName(), + context), + OperationName(mlir::TF::XlaSpmdFullToShardShapeOp::getOperationName(), + context), + OperationName(mlir::TF::XlaSpmdShardToFullShapeOp::getOperationName(), context), - OperationName(TF::XlaSpmdFullToShardShapeOp::getOperationName(), context), - OperationName(TF::XlaSpmdShardToFullShapeOp::getOperationName(), context), }; supported_ops->insert(allowlist_ops.begin(), allowlist_ops.end()); @@ -120,46 +154,46 @@ void AddOldBridgeOnlyOps(MLIRContext* context, void AddSupportedFunctionalOps(MLIRContext* context, llvm::DenseSet* supported_ops) { supported_ops->insert( - OperationName(TF::CaseRegionOp::getOperationName(), context)); - supported_ops->insert( - OperationName(TF::IfRegionOp::getOperationName(), context)); - supported_ops->insert( - OperationName(TF::InplaceAddOp::getOperationName(), context)); - supported_ops->insert( - OperationName(TF::WhileRegionOp::getOperationName(), context)); - supported_ops->insert( - OperationName(TF::XlaCallModuleOp::getOperationName(), context)); + OperationName(mlir::TF::CaseRegionOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaReduceOp::getOperationName(), context)); + OperationName(mlir::TF::IfRegionOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaReduceWindowOp::getOperationName(), context)); + OperationName(mlir::TF::InplaceAddOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaRngBitGeneratorOp::getOperationName(), context)); + OperationName(mlir::TF::WhileRegionOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaScatterOp::getOperationName(), context)); + OperationName(mlir::TF::XlaCallModuleOp::getOperationName(), context)); supported_ops->insert( - 
OperationName(TF::XlaSelectAndScatterOp::getOperationName(), context)); + OperationName(mlir::TF::XlaReduceOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::SymbolicGradientOp::getOperationName(), context)); + OperationName(mlir::TF::XlaReduceWindowOp::getOperationName(), context)); + supported_ops->insert(OperationName( + mlir::TF::XlaRngBitGeneratorOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaVariadicReduceOp::getOperationName(), context)); + OperationName(mlir::TF::XlaScatterOp::getOperationName(), context)); + supported_ops->insert(OperationName( + mlir::TF::XlaSelectAndScatterOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaVariadicReduceV2Op::getOperationName(), context)); + OperationName(mlir::TF::SymbolicGradientOp::getOperationName(), context)); + supported_ops->insert(OperationName( + mlir::TF::XlaVariadicReduceOp::getOperationName(), context)); + supported_ops->insert(OperationName( + mlir::TF::XlaVariadicReduceV2Op::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaVariadicSortOp::getOperationName(), context)); + OperationName(mlir::TF::XlaVariadicSortOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::XlaReplicaIdOp::getOperationName(), context)); + OperationName(mlir::TF::XlaReplicaIdOp::getOperationName(), context)); supported_ops->insert( - OperationName(TF::YieldOp::getOperationName(), context)); + OperationName(mlir::TF::YieldOp::getOperationName(), context)); } // These embedding ops are rewritten when running TPUCompileOp. void AddRewrittenEmbeddingOps(MLIRContext* context, llvm::DenseSet* supported_ops) { supported_ops->insert(OperationName( - TF::RecvTPUEmbeddingActivationsOp::getOperationName(), context)); + mlir::TF::RecvTPUEmbeddingActivationsOp::getOperationName(), context)); supported_ops->insert(OperationName( - TF::SendTPUEmbeddingGradientsOp::getOperationName(), context)); + mlir::TF::SendTPUEmbeddingGradientsOp::getOperationName(), context)); } // Stack, TensorList and TensorArray ops are rewritten during the second phase @@ -171,32 +205,32 @@ void AddRewrittenCompositeOps(MLIRContext* context, #define GET_OPERATION_NAME(op) OperationName(op::getOperationName(), context) llvm::SmallDenseSet allowlist_ops = { // Stack ops. - GET_OPERATION_NAME(TF::StackV2Op), - GET_OPERATION_NAME(TF::StackPushV2Op), - GET_OPERATION_NAME(TF::StackPopV2Op), + GET_OPERATION_NAME(mlir::TF::StackV2Op), + GET_OPERATION_NAME(mlir::TF::StackPushV2Op), + GET_OPERATION_NAME(mlir::TF::StackPopV2Op), // Tensor Array ops. - GET_OPERATION_NAME(TF::TensorArrayV3Op), - GET_OPERATION_NAME(TF::TensorArrayReadV3Op), - GET_OPERATION_NAME(TF::TensorArrayWriteV3Op), - GET_OPERATION_NAME(TF::TensorArrayConcatV3Op), - GET_OPERATION_NAME(TF::TensorArraySplitV3Op), - GET_OPERATION_NAME(TF::TensorArraySizeV3Op), - GET_OPERATION_NAME(TF::TensorArrayGradV3Op), - GET_OPERATION_NAME(TF::TensorArrayGatherV3Op), - GET_OPERATION_NAME(TF::TensorArrayScatterV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayReadV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayWriteV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayConcatV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArraySplitV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArraySizeV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayGradV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayGatherV3Op), + GET_OPERATION_NAME(mlir::TF::TensorArrayScatterV3Op), // Tensor List Ops. 
- GET_OPERATION_NAME(TF::EmptyTensorListOp), - GET_OPERATION_NAME(TF::TensorListReserveOp), - GET_OPERATION_NAME(TF::TensorListFromTensorOp), - GET_OPERATION_NAME(TF::TensorListPushBackOp), - GET_OPERATION_NAME(TF::TensorListPopBackOp), - GET_OPERATION_NAME(TF::TensorListGetItemOp), - GET_OPERATION_NAME(TF::TensorListSetItemOp), - GET_OPERATION_NAME(TF::TensorListLengthOp), - GET_OPERATION_NAME(TF::TensorListElementShapeOp), - GET_OPERATION_NAME(TF::TensorListGatherOp), - GET_OPERATION_NAME(TF::TensorListScatterIntoExistingListOp), - GET_OPERATION_NAME(TF::TensorListStackOp), + GET_OPERATION_NAME(mlir::TF::EmptyTensorListOp), + GET_OPERATION_NAME(mlir::TF::TensorListReserveOp), + GET_OPERATION_NAME(mlir::TF::TensorListFromTensorOp), + GET_OPERATION_NAME(mlir::TF::TensorListPushBackOp), + GET_OPERATION_NAME(mlir::TF::TensorListPopBackOp), + GET_OPERATION_NAME(mlir::TF::TensorListGetItemOp), + GET_OPERATION_NAME(mlir::TF::TensorListSetItemOp), + GET_OPERATION_NAME(mlir::TF::TensorListLengthOp), + GET_OPERATION_NAME(mlir::TF::TensorListElementShapeOp), + GET_OPERATION_NAME(mlir::TF::TensorListGatherOp), + GET_OPERATION_NAME(mlir::TF::TensorListScatterIntoExistingListOp), + GET_OPERATION_NAME(mlir::TF::TensorListStackOp), }; #undef GET_OPERATION_NAME @@ -204,13 +238,13 @@ void AddRewrittenCompositeOps(MLIRContext* context, } bool IsStringType(Type type) { - if (type.isa()) return true; + if (type.isa()) return true; - auto sub_type = type.dyn_cast(); + auto sub_type = type.dyn_cast(); if (!sub_type) return false; bool has_string = llvm::any_of(sub_type.GetSubtypes(), [](TensorType type) { - return type.getElementType().isa(); + return type.getElementType().isa(); }); return has_string; } @@ -241,11 +275,10 @@ bool MatchesPattern(Operation& op, bool IsSupportedOp(Operation& op, const llvm::DenseSet& supported_ops, const Dialect* tf_dialect) { - if (op.getDialect() != tf_dialect) - return true; + if (op.getDialect() != tf_dialect) return true; // Assert has a legalization that later removes it so we don't want to outside // compile it ever for performance reasons. - if (llvm::isa(op)) return true; + if (llvm::isa(op)) return true; if (HasStringOperand(op)) return false; if (HasStringResult(op)) return false; @@ -253,25 +286,11 @@ bool IsSupportedOp(Operation& op, auto abstractOp = op.getRegisteredInfo(); if (!abstractOp) return false; - return mhlo::HasTf2XlaFallback(abstractOp->getTypeID()); -} - -// Checks all regions of `op` for captured string operands. -bool HasCapturedStringOperand(Operation* op) { - bool string_operand = false; - for (auto& region : op->getRegions()) { - mlir::visitUsedValuesDefinedAbove( - region, region, [&](mlir::OpOperand* operand) { - if (getElementTypeOrSelf(operand->get()).isa()) - string_operand = true; - }); - if (string_operand) return string_operand; - } - return string_operand; + return mlir::mhlo::HasTf2XlaFallback(abstractOp->getTypeID()); } bool IsVariant(Value value) { - return getElementTypeOrSelf(value.getType()).isa(); + return getElementTypeOrSelf(value.getType()).isa(); } bool HasOutsideCompiledAncestor(Operation* op) { @@ -287,7 +306,7 @@ bool HasOutsideCompiledAncestor(Operation* op) { // If any tf.variants are inputs/outputs to the another outside compiled // Operation, `op`, mark them for outside compilation unless they are already // marks with outside compilation attribute. 
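The comment above describes propagating the outside-compilation marking along variant-typed def/use edges with a worklist, which `MarkVariantInputsOutputs` below implements with a `std::queue`. A simplified, self-contained sketch of that propagation (the attribute value "auto" and the callback parameter are illustrative; the real pass also checks terminator traits and outside-compiled ancestors):

```cpp
#include <queue>
#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"

constexpr char kOcAttr[] = "_xla_outside_compilation";

void MarkOp(mlir::Operation* op) {
  op->setAttr(kOcAttr, mlir::StringAttr::get(op->getContext(), "auto"));
}

// Starting from ops already marked for outside compilation, mark producers of
// variant operands and users of variant results, transitively.
void PropagateAlongVariants(std::queue<mlir::Operation*>& worklist,
                            llvm::function_ref<bool(mlir::Value)> is_variant) {
  while (!worklist.empty()) {
    mlir::Operation* op = worklist.front();
    worklist.pop();
    for (mlir::Value operand : op->getOperands()) {
      mlir::Operation* def = operand.getDefiningOp();
      if (!def || !is_variant(operand) || def->hasAttr(kOcAttr)) continue;
      MarkOp(def);
      worklist.push(def);
    }
    for (mlir::Value result : op->getResults()) {
      if (!is_variant(result)) continue;
      for (mlir::Operation* user : result.getUsers()) {
        if (user->hasAttr(kOcAttr)) continue;
        MarkOp(user);
        worklist.push(user);
      }
    }
  }
}
```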
-void MarkVariantInputsOutputs(tf_device::ClusterOp tpu_cluster) { +void MarkVariantInputsOutputs(mlir::tf_device::ClusterOp tpu_cluster) { std::queue outside_compiled_ops; tpu_cluster.walk([&](Operation* op) { if (op->hasAttrOfType(kXlaOutsideCompilationAttr)) @@ -316,7 +335,7 @@ void MarkVariantInputsOutputs(tf_device::ClusterOp tpu_cluster) { for (auto value : op->getResults()) { if (IsVariant(value)) { for (auto user : value.getUsers()) { - if (!user->hasTrait() && + if (!user->hasTrait() && !HasOutsideCompiledAncestor(user) && !user->getAttrOfType(kXlaOutsideCompilationAttr)) { user->setAttr(kXlaOutsideCompilationAttr, @@ -358,7 +377,7 @@ LogicalResult MarkUncompilableOps( if (outside_compiled_cluster_counter > 0) { auto_outside_compilation_gauge->GetCell()->Set(true); } - return success(); + return mlir::success(); } // Check for uncompilable ops that are in `tf_dialect` and are not already @@ -369,7 +388,7 @@ bool ContainsUncompilableOps(const Dialect* tf_dialect, Block* block, // Check if op or any parent is already marked for outside compilation. block->walk([&](Operation* op) { Operation* iter_op = op; - while (iter_op && !llvm::isa(iter_op)) { + while (iter_op && !llvm::isa(iter_op)) { if (iter_op->hasAttrOfType(kXlaOutsideCompilationAttr)) { return; } @@ -444,9 +463,9 @@ void MarkOpsForOutsideCompilation::runOnOperation() { return signalPassFailure(); } RewritePatternSet patterns(&getContext()); - mhlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); - TF::PopulateTFLoweringBeforeHLOPatterns(module.getContext(), &patterns); - TF::PopulateLoweringQuantizedPatterns(module.getContext(), &patterns); + mlir::mhlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); + mlir::TF::PopulateTFLoweringBeforeHLOPatterns(module.getContext(), &patterns); + mlir::TF::PopulateLoweringQuantizedPatterns(module.getContext(), &patterns); AddCanonicalizationPatterns(module.getContext(), &patterns); // `supported_ops` contains the name of all of the ops that can potentially be @@ -465,7 +484,7 @@ void MarkOpsForOutsideCompilation::runOnOperation() { AddRewrittenEmbeddingOps(module.getContext(), &supported_ops); AddRewrittenCompositeOps(module.getContext(), &supported_ops); - auto result = module.walk([&](tf_device::ClusterOp cluster) { + auto result = module.walk([&](mlir::tf_device::ClusterOp cluster) { // Only if `allow_soft_placement` attribute is true should we mark ops // for outside compilation. auto soft_placement_attr = @@ -498,5 +517,6 @@ CreateMarkOpsForOutsideCompilationPass() { return std::make_unique(); } -} // namespace TFDevice -} // namespace mlir +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/dialect_to_executor_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h similarity index 76% rename from tensorflow/compiler/mlir/tf2xla/internal/passes/dialect_to_executor_passes.h rename to tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h index 74247860fcd36e..4e28930b3c1f8e 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/dialect_to_executor_passes.h +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_DIALECT_TO_EXECUTOR_PASSES_H_ -#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_DIALECT_TO_EXECUTOR_PASSES_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_MLIR_TO_GRAPH_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_MLIR_TO_GRAPH_PASSES_H_ #include @@ -25,8 +25,11 @@ namespace internal { std::unique_ptr> CreateVerifyInputDialectToExecutorPass(); +#define GEN_PASS_REGISTRATION #define GEN_PASS_DECL_VERIFYINPUTDIALECTTOEXECUTORPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h.inc" + } // namespace internal } // namespace tf2xla } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_DIALECT_TO_EXECUTOR_PASSES_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_PASSES_MLIR_TO_GRAPH_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/dialect_to_executor_passes.td b/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.td similarity index 90% rename from tensorflow/compiler/mlir/tf2xla/internal/passes/dialect_to_executor_passes.td rename to tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.td index 9c7891daa84c6b..a8796805753144 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/dialect_to_executor_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.td @@ -1,4 +1,3 @@ - /* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,10 +11,10 @@ limitations under the License. ==============================================================================*/ include "mlir/Pass/PassBase.td" -def VerifyInputDialectToExecutor : Pass<"verify-input-dialect-to-executor-pass", "mlir::func::FuncOp"> { +def VerifyInputDialectToExecutorPass : Pass<"verify-input-dialect-to-executor-pass", "mlir::func::FuncOp"> { let summary = "Verify that TF dialect to executor converter receives the correct input."; let description = [{ Verifies the input before exporting to TF executor. This includes checking whether the Ops are in TF functional, have device attributes & there are no tf_device.cluster_func ops. }]; let constructor = "tensorflow::tf2xla::internal::CreateVerifyInputDialectToExecutorPass()"; -} +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc index 235a7ca1ec5468..1cf9115d9572a2 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass.cc @@ -14,13 +14,15 @@ limitations under the License. 
==============================================================================*/ #include -#include #include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h" namespace tensorflow { namespace tf2xla { @@ -31,6 +33,9 @@ namespace { #define GEN_PASS_DEF_VERIFYCLUSTERINGPASS #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" +using mlir::Operation; +using mlir::WalkResult; + class VerifyClusteringPass : public impl::VerifyClusteringPassBase { public: @@ -38,20 +43,26 @@ class VerifyClusteringPass }; void VerifyClusteringPass::runOnOperation() { - std::set valid_namespaces = {"tf", "func", "return", "tf_device", - "builtin"}; - mlir::Operation* func_op = getOperation(); + Operation* func_op = getOperation(); - auto walk_result = func_op->walk([&](mlir::Operation* op) { - if (valid_namespaces.find(op->getDialect()->getNamespace().str()) == - valid_namespaces.end()) { + auto walk_result = func_op->walk([&](Operation* op) { + if (!tensorflow::tf2xla::internal::IsInBridgeAcceptableDialects(op)) { std::string error = "op is in dialect " + op->getDialect()->getNamespace().str() + " not in tf functional dialect"; op->emitError() << error; + return WalkResult::interrupt(); + } + + if (op->hasAttr(mlir::TF::kXlaOutsideCompilationAttr)) { + std::string error = + "op has outside compilation attribute _xla_outside_compilation which " + "is not allowed after clustering"; + op->emitError() << error; return mlir::WalkResult::interrupt(); } - return mlir::WalkResult::advance(); + + return WalkResult::advance(); }); if (walk_result.wasInterrupted()) { diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir index 23e60242621f37..7ba98798c126df 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir @@ -13,4 +13,14 @@ func.func @testNotTfDialect(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32> func.func @testTFDialect(%arg0: tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> { %0 = "tf.Identity"(%arg0) : (tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> func.return %0 : tensor<4x2x!tf_type.string> -} \ No newline at end of file +} + + +// ----- + +func.func @testTFDialect(%arg0: tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> { + // expected-error@below {{op has outside compilation attribute _xla_outside_compilation which is not allowed after clustering}} + %0 = "tf.Identity"(%arg0) {_xla_outside_compilation = "cluster1"}: (tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> + func.return %0 : tensor<4x2x!tf_type.string> +} + diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass.cc new file mode 100644 index 00000000000000..53c1e5bab16ad0 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass.cc @@ -0,0 +1,84 @@ +/* 
Copyright 2023 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +namespace { +using mlir::Operation; +using mlir::OperationPass; +using mlir::WalkResult; +using mlir::func::FuncOp; + +#define GEN_PASS_DEF_VERIFYINPUTDIALECTTOEXECUTORPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h.inc" + +class VerifyInputDialectToExecutorPass + : public impl::VerifyInputDialectToExecutorPassBase< + VerifyInputDialectToExecutorPass> { + public: + void runOnOperation() override; +}; + +bool IsTfDeviceClusterFuncOp(Operation* op) { + std::string kClusterFuncOpName = "tf_device.cluster_func"; + return op->getName().getStringRef().str() == kClusterFuncOpName; +} + +void VerifyInputDialectToExecutorPass::runOnOperation() { + Operation* func_op = getOperation(); + + auto walk_result = func_op->walk([&](Operation* op) { + if (!tensorflow::tf2xla::internal::IsInBridgeAcceptableDialects(op)) { + std::string error = "op is in dialect " + + op->getDialect()->getNamespace().str() + + " which is not an accepted dialect"; + op->emitError() << error; + return WalkResult::interrupt(); + } + + if (IsTfDeviceClusterFuncOp(op)) { + std::string error = + "failed TF functional to executor validation, op " + "tf_device.cluster_func is not allowed"; + op->emitError() << error; + return WalkResult::interrupt(); + } + + return WalkResult::advance(); + }); + + if (walk_result.wasInterrupted()) { + signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> +CreateVerifyInputDialectToExecutorPass() { + return std::make_unique(); +} + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir new file mode 100644 index 00000000000000..5a6fda697d23fa --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir @@ -0,0 +1,34 @@ +// RUN: tf-opt -verify-input-dialect-to-executor-pass -split-input-file -verify-diagnostics %s | FileCheck %s +// Tests the VerifyClusteringPass Pass, ensures that an error is thrown when validation fails. 
+ +// ----- + +// CHECK-LABEL: func @testNoClusterFuncOpPasses +func.func @testNoClusterFuncOpPasses(%arg0: tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> { + %0 = "tf.Identity"(%arg0) : (tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> + func.return %0 : tensor<4x2x!tf_type.string> +} + +// ----- + +func.func @testClusterFuncOpFails(%arg0: tensor) -> tensor { + // expected-error@below {{failed TF functional to executor validation, op tf_device.cluster_func is not allowed}} + %cluster = "tf_device.cluster_func"(%arg0) {func = @_func} : (tensor) -> tensor + func.return %cluster : tensor +} + +// ----- + +// CHECK-LABEL: func @testTFDialect +func.func @testTFDialect(%arg0: tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> { + %0 = "tf.Identity"(%arg0) : (tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!tf_type.string> + func.return %0 : tensor<4x2x!tf_type.string> +} + +// ----- + +func.func @testNotTfDialect(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { + // expected-error@below {{op is in dialect chlo which is not an accepted dialect}} + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + func.return %0 : tensor<1x32x10x32xi32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_cluster_formation.cc similarity index 62% rename from tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc rename to tensorflow/compiler/mlir/tf2xla/internal/passes/xla_cluster_formation.cc index f99b754074f568..cbedce815b8229 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/xla_cluster_formation.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,23 +14,47 @@ limitations under the License. 
==============================================================================*/ #include -#include +#include +#include #include +#include "absl/strings/str_cat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h" #include "tensorflow/core/common_runtime/inline_function_utils.h" -namespace mlir { - -namespace { +namespace tensorflow { +namespace tf2xla { +namespace internal { + +using mlir::Block; +using mlir::CallInterfaceCallable; +using mlir::CallOpInterface; +using mlir::ModuleOp; +using mlir::OpBuilder; +using mlir::Operation; +using mlir::OperationPass; +using mlir::SymbolTable; +using mlir::SymbolTableCollection; +using mlir::SymbolUserOpInterface; +using mlir::func::FuncOp; #define GEN_PASS_DEF_XLACLUSTERFORMATIONPASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" constexpr char kAllowSoftPlacementAttr[] = "allow_soft_placement"; @@ -47,22 +71,22 @@ void CopyAttribute(const llvm::StringRef attr, Operation *src, } } -std::string getClusterOutlinedFunctionName(func::FuncOp func) { +std::string getClusterOutlinedFunctionName(FuncOp func) { return func.getSymName().str() + "_cluster_func"; } -void AddClusterAttributes(OpBuilder &builder, func::FuncOp entry_func, - tf_device::ClusterOp cluster) { - TF::CopyDeviceAndUnderscoredAttributes(entry_func, cluster); +void AddClusterAttributes(OpBuilder &builder, FuncOp entry_func, + mlir::tf_device::ClusterOp cluster) { + mlir::TF::CopyDeviceAndUnderscoredAttributes(entry_func, cluster); CopyAttribute(kAllowSoftPlacementAttr, entry_func, cluster); cluster->setAttr( - TF::kClusterOutlinedFunctionNameAttr, + mlir::TF::kClusterOutlinedFunctionNameAttr, builder.getStringAttr(getClusterOutlinedFunctionName(entry_func))); } // Wrap the body of `func` in a device cluster. `func` must have a single // region and a single block. -LogicalResult EncapsulateEntryFunctionBody(func::FuncOp entry_func) { +mlir::LogicalResult EncapsulateEntryFunctionBody(FuncOp entry_func) { // We've verified the input graph has single-entry and single-block entry // functions. This is just in case passes in the pipeline uninteionally break // the assumption, and not expected to happen in practice. 
@@ -70,7 +94,7 @@ LogicalResult EncapsulateEntryFunctionBody(func::FuncOp entry_func) { entry_func->emitError() << "TF2XLA MLIR CPU/GPU MLIR phase 1 bridge " "expects single region and single " "block in an entry function."; - return failure(); + return mlir::failure(); } std::vector ops_without_terminator; for (auto &op : entry_func.front().without_terminator()) { @@ -79,36 +103,39 @@ LogicalResult EncapsulateEntryFunctionBody(func::FuncOp entry_func) { Operation *original_return_op = entry_func.front().getTerminator(); OpBuilder builder(entry_func.getContext()); builder.setInsertionPointToEnd(&entry_func.front()); - auto cluster = builder.create( + auto cluster = builder.create( entry_func.getLoc(), entry_func.getResultTypes()); cluster.getBody().push_back(new Block); for (auto &op : ops_without_terminator) { op->moveBefore(&cluster.GetBody(), cluster.GetBody().end()); } builder.setInsertionPointToEnd(&cluster.GetBody()); - builder.create(original_return_op->getLoc(), - original_return_op->getResultTypes(), - original_return_op->getOperands()); + builder.create( + original_return_op->getLoc(), original_return_op->getResultTypes(), + original_return_op->getOperands()); original_return_op->erase(); builder.setInsertionPointToEnd(&entry_func.front()); - builder.create(entry_func->getLoc(), cluster->getResults()); + builder.create(entry_func->getLoc(), + cluster->getResults()); AddClusterAttributes(builder, entry_func, cluster); - return success(); + return mlir::success(); } -void EncapsulatePartitionedCall(Operation *call_op, StringAttr callee_name) { +void EncapsulatePartitionedCall(Operation *call_op, + mlir::StringAttr callee_name) { OpBuilder builder(call_op); - auto cluster = builder.create( + auto cluster = builder.create( call_op->getLoc(), call_op->getResultTypes()); cluster.getBody().push_back(new Block); call_op->replaceAllUsesWith(cluster.getResults()); call_op->moveBefore(&cluster.GetBody(), cluster.GetBody().end()); builder.setInsertionPointToEnd(&cluster.GetBody()); - builder.create(call_op->getLoc(), call_op->getResults()); + builder.create(call_op->getLoc(), + call_op->getResults()); // Propagate necessary attributes to the cluster so that when it's outlined, // the function will have correct attributes. - TF::CopyDeviceAndUnderscoredAttributes(call_op, cluster); - cluster->setAttr(TF::kClusterOutlinedFunctionNameAttr, callee_name); + mlir::TF::CopyDeviceAndUnderscoredAttributes(call_op, cluster); + cluster->setAttr(mlir::TF::kClusterOutlinedFunctionNameAttr, callee_name); cluster->setAttr(kAllowSoftPlacementAttr, builder.getBoolAttr(true)); } @@ -116,30 +143,31 @@ void EncapsulatePartitionedCall(Operation *call_op, StringAttr callee_name) { // `func` and is with compilation markers in a device cluster. For nested calls, // if the outermost one has the markers, encapsulates the outermost call and // returns. Otherwise, we'll keep going through inner calls until we found one. 
-LogicalResult EncapsulateFirstXlaCompilablePartitionedCalls( - func::FuncOp func, SymbolTableCollection &symbol_table_collection, +mlir::LogicalResult EncapsulateFirstXlaCompilablePartitionedCalls( + FuncOp func, SymbolTableCollection &symbol_table_collection, SymbolTable &symtab) { auto has_no_compile_device_type = [](SymbolUserOpInterface op) { - return !op->hasAttr(TF::kCompileDeviceTypeAttr); + return !op->hasAttr(mlir::TF::kCompileDeviceTypeAttr); }; mlir::OpBuilder builder(func.getContext()); auto noinline_attr_name = absl::StrCat("tf.", tensorflow::kNoInlineAttr); llvm::SmallVector noinline_pcall_ops, outermost_pcall_ops; - if (failed(GetOpsOfTypeUntilMiss( - func, symtab, /*predicate*/ has_no_compile_device_type, - /*hits*/ noinline_pcall_ops, - /*first_misses*/ outermost_pcall_ops))) { - return failure(); + if (mlir::failed( + mlir::GetOpsOfTypeUntilMiss( + func, symtab, /*predicate*/ has_no_compile_device_type, + /*hits*/ noinline_pcall_ops, + /*first_misses*/ outermost_pcall_ops))) { + return mlir::failure(); } // Cluster outermost partitioned calls with _xla_compile_device_type // attribute. for (auto &pcall_op : outermost_pcall_ops) { auto call = llvm::cast(pcall_op.getOperation()); CallInterfaceCallable callable = call.getCallableForCallee(); - auto sym = callable.get(); + auto sym = callable.get(); EncapsulatePartitionedCall(pcall_op, sym.getRootReference()); } // Partitioned calls are executed asynchronous. The calls outside of @@ -147,20 +175,20 @@ LogicalResult EncapsulateFirstXlaCompilablePartitionedCalls( // performance. for (auto &pcall_op : noinline_pcall_ops) { auto call = llvm::cast(pcall_op.getOperation()); - auto callee = llvm::cast( - call.resolveCallable(&symbol_table_collection)); + auto callee = + llvm::cast(call.resolveCallable(&symbol_table_collection)); callee->setAttr(noinline_attr_name, builder.getBoolAttr(true)); } - return success(); + return mlir::success(); } void XlaClusterFormationPass::runOnOperation() { ModuleOp module = getOperation(); SymbolTableCollection symbol_table_collection; SymbolTable symtab = symbol_table_collection.getSymbolTable(module); - llvm::SmallVector entry_funcs = GetEntryFunctions(module); + llvm::SmallVector entry_funcs = GetEntryFunctions(module); for (auto &entry_func : entry_funcs) { - if (entry_func->hasAttr(TF::kCompileDeviceTypeAttr)) { + if (entry_func->hasAttr(mlir::TF::kCompileDeviceTypeAttr)) { if (EncapsulateEntryFunctionBody(entry_func).failed()) { return signalPassFailure(); } @@ -172,12 +200,10 @@ void XlaClusterFormationPass::runOnOperation() { } } -} // namespace - -namespace TFDevice { std::unique_ptr> CreateXlaClusterFormationPass() { return std::make_unique(); } -} // namespace TFDevice -} // namespace mlir +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/utils/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/utils/BUILD new file mode 100644 index 00000000000000..a67178be9d770a --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/utils/BUILD @@ -0,0 +1,45 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/tf2xla/internal:__subpackages__", + ], + licenses = ["notice"], +) + +cc_library( + name = "dialect_detection_utils", + srcs = [ + "dialect_detection_utils.cc", + ], + hdrs = [ + "dialect_detection_utils.h", + 
], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:framework", + "//tensorflow/core/transforms/toposort:Pass", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +tf_cc_test( + name = "dialect_detection_utils_test", + srcs = ["dialect_detection_utils_test.cc"], + deps = [ + ":dialect_detection_utils", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@stablehlo//:chlo_ops", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.cc b/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.cc new file mode 100644 index 00000000000000..fe37304826416f --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.cc @@ -0,0 +1,45 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h" + +#include +#include + +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +bool IsInBridgeAcceptableDialects(mlir::Operation* op) { + const std::set kBuiltinNamespaces = {"func", "return", + "builtin"}; + const std::set kBridgeAcceptableNamespaces = {"tf", "tf_device"}; + bool isInDefaulNamespaces = + kBuiltinNamespaces.find(op->getDialect()->getNamespace().str()) != + kBuiltinNamespaces.end(); + bool isInBridgeAcceptableNamespaces = + kBridgeAcceptableNamespaces.find( + op->getDialect()->getNamespace().str()) != + kBridgeAcceptableNamespaces.end(); + return isInDefaulNamespaces || isInBridgeAcceptableNamespaces; +} + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h similarity index 53% rename from tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor.cc rename to tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h index dd78c065371d6b..6dd9851f7507bf 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h @@ -1,44 +1,33 @@ /* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_DIALECT_DETECTION_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_DIALECT_DETECTION_UTILS_H_ + +#include "mlir/IR/Operation.h" // from @llvm-project namespace tensorflow { namespace tf2xla { namespace internal { -namespace { - -#define GEN_PASS_DEF_VERIFYINPUTDIALECTTOEXECUTORPASS -#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" - -class VerifyInputDialectToexecutorPass - : public impl::VerifyInputDialectToexecutorPassBase< - VerifyInputDialectToexecutorPass> { - public: - void runOnOperation() override; -}; - -void VerifyInputDialectToexecutorPass::runOnOperation() {} - -} // namespace - -std::unique_ptr> -CreateVerifyInputDialectToExecutorPass() { - return std::make_unique(); -} +// Returns true if the op has a valid namespace during clustering & tf dialect +// to executor components of the Bridge. +bool IsInBridgeAcceptableDialects(mlir::Operation* op); } // namespace internal } // namespace tf2xla } // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_UTILS_DIALECT_DETECTION_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils_test.cc new file mode 100644 index 00000000000000..b6a56d70290ceb --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils_test.cc @@ -0,0 +1,76 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/internal/utils/dialect_detection_utils.h" + +#include +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +namespace { + +using mlir::MLIRContext; +using mlir::OpBuilder; +using mlir::Operation; +using mlir::OperationState; +using mlir::UnknownLoc; +using mlir::chlo::ChloDialect; +using mlir::TF::TensorFlowDialect; +using tensorflow::tf2xla::internal::IsInBridgeAcceptableDialects; + +class SharedUtilsTest : public ::testing::Test {}; + +TEST_F(SharedUtilsTest, IsInFunctionalDialectPasses) { + MLIRContext context; + context.loadDialect(); + OpBuilder opBuilder(&context); + OperationState state(UnknownLoc::get(opBuilder.getContext()), + /*OperationName=*/"tf.Const"); + mlir::Operation* op = Operation::create(state); + + bool result = IsInBridgeAcceptableDialects(op); + + EXPECT_TRUE(result); + op->destroy(); +} + +TEST_F(SharedUtilsTest, IsInFunctionalDialectFails) { + MLIRContext context; + context.loadDialect(); + OpBuilder opBuilder(&context); + OperationState state(UnknownLoc::get(opBuilder.getContext()), + /*OperationName=*/"chlo.broadcast_add"); + Operation* op = Operation::create(state); + + bool result = IsInBridgeAcceptableDialects(op); + + EXPECT_FALSE(result); + op->destroy(); +} + +} // namespace +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir index f6e3ca10f5a279..56620e66870520 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-include-tf2xla-fallback.mlir @@ -51,7 +51,7 @@ func.func @batchmatmulv2(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> // SUPPORTED_FALLBACK_DEVICE: mhlo.dot_general // SUPPORTED_FALLBACK_DEVICE: mhlo.transpose - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, grad_x = false, grad_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> func.return %0 : tensor<3x4x4xf32> } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir index a732c6d61281ca..b8552d1b6bdd10 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir @@ -524,7 +524,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK: mhlo.reduce // CHECK: mhlo.dot_general // CHECK: mhlo.transpose - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<1x4x2xf32>, 
tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, grad_x = false, grad_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> func.return %0 : tensor<3x4x4xf32> } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD index ed0429ad242c94..0f0b1182e50bb7 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -466,6 +466,7 @@ cc_library( hdrs = ["legalization_op_config.h"], visibility = [ "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__", + "//tensorflow/compiler/mlir/tf2xla/internal:__subpackages__", ], deps = [ "//tensorflow/compiler/mlir/tensorflow", diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc index 6e5f8285a0a928..979ec3f97e629e 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc @@ -336,6 +336,7 @@ bool IsOpTypeAllowedTf2XlaFallback(const TypeID& type_id) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -481,6 +482,7 @@ bool IsOpTypeAllowedTf2XlaPreferred(const TypeID& type_id) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc index ade2b5faa73c8a..7084f98b28568e 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc @@ -131,8 +131,8 @@ TEST_F(LegalizationOpConfigTest, CountLoweringsSet) { // from MLIR to TF2XLA), these numbers should change. Or if TF Dialect adds // a new op, we should expect these to change too. EXPECT_EQ(mlir_lowering_count, 67); - EXPECT_EQ(tf2xla_fallback_count, 315); - EXPECT_EQ(non_categorized_count, 422); + EXPECT_EQ(tf2xla_fallback_count, 316); + EXPECT_EQ(non_categorized_count, 421); } // Just a counter test to see which ops have duplicate lowerings. This isn't a diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc index 70a32c7a270049..763e94734f6d01 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc @@ -501,8 +501,8 @@ Value CreateSubTuple(OpBuilder& builder, Value value, size_t end) { // return the first element. Otherwise, `mhlo.get_tuple_element` users are // simply updated with `replacement`, and all other users are updated with a // slice of `replacement`. 
-void ReplaceWithTupleResult(OpBuilder& builder, ArrayRef values, - ArrayRef replacements, bool flatten_tuple) { +void ReplaceWithTupleResult(OpBuilder& builder, ValueRange values, + ValueRange replacements, bool flatten_tuple) { if (flatten_tuple) { for (size_t result_index = 0; result_index < values.size(); result_index++) values[result_index].replaceAllUsesWith(replacements[result_index]); @@ -547,10 +547,8 @@ Value UpdateControlFlowBlockArgWithToken(OpBuilder& builder, Block& block, block.addArguments( types, SmallVector(types.size(), block.getParent()->getLoc())); - auto old_args = ArrayRef(block.getArguments().begin(), - block.getArguments().begin() + old_args_size); - auto new_args = ArrayRef(block.getArguments().begin() + old_args_size, - block.getArguments().end()); + ValueRange old_args = block.getArguments().take_front(old_args_size); + ValueRange new_args = block.getArguments().drop_front(old_args_size); assert(!new_args.empty()); ReplaceWithTupleResult(builder, old_args, new_args, /*flatten_tuple=*/true); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index 3aad616f162b17..0ee5d1dee5925d 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -392,7 +392,7 @@ foreach src = [TF_PreventGradientOp, TF_CheckNumericsOp] in def GetPrecisionConfig: NativeCodeCall< "GetPrecisionConfig(&$_builder)">; -def : Pat<(TF_MatMulOp $a, $b, $transpose_a, $transpose_b), +def : Pat<(TF_MatMulOp $a, $b, $transpose_a, $transpose_b, $grad_a, $grad_b), (MHLO_DotOp (TF_TransposeOp $a, (TF_ConstOp (Get2DTransposePerm $transpose_a))), (TF_TransposeOp $b, (TF_ConstOp (Get2DTransposePerm $transpose_b))), diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index f803230ea4f504..be8298824029dd 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h" #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/mlir_to_graph_passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/compiler/mlir/tosa/tf_passes.h" #include "tensorflow/compiler/mlir/tosa/tf_tfl_passes.h" @@ -56,7 +57,8 @@ int main(int argc, char **argv) { mlir::mhlo::registerLegalizeTfPasses(); mlir::mhlo::registerTfXlaPasses(); mlir::quant::stablehlo::registerBridgePasses(); - tensorflow::tf2xla::internal::registerTFXLABridgePasses(); + tensorflow::tf2xla::internal::registerTFXLABridgeClusteringPasses(); + tensorflow::tf2xla::internal::registerTFXLABridgeMlirToGraphPasses(); mlir::tosa::registerLegalizeTosaPasses(); mlir::tosa::registerTFtoTOSALegalizationPipeline(); mlir::tosa::registerTFLtoTOSALegalizationPipeline(); diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc index b5d2bf7d9933b9..fade4b23bf70ea 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc @@ -89,8 +89,7 @@ class TFRInlinerInterface : public DialectInlinerInterface { // Handle the given inlined terminator by replacing it with a new operation // as necessary. 
Required when the region has only one block. - void handleTerminator(Operation *op, - ArrayRef valuesToRepl) const final { + void handleTerminator(Operation *op, ValueRange valuesToRepl) const final { auto retValOp = dyn_cast(op); if (!retValOp) return; diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_types.h b/tensorflow/compiler/mlir/tfr/ir/tfr_types.h index c862f0f1b5f983..e0e24f4aca8f2d 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_types.h +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_types.h @@ -102,18 +102,21 @@ class TFRTypeImpl : public Type::TypeBase { class TFRTensorType : public detail::TFRTypeImpl { public: using TFRBase::TFRBase; + static constexpr StringLiteral name = "tfr.tensor"; static std::string getTypeName() { return "TFRTensorType"; } }; class TFRTensorListType : public detail::TFRTypeImpl { public: using TFRBase::TFRBase; + static constexpr StringLiteral name = "tfr.tensor_list"; static std::string getTypeName() { return "TFRTensorListType"; } }; class TFRAttrType : public Type::TypeBase { public: using Base::Base; + static constexpr StringLiteral name = "tfr.attr"; static std::string getTypeName() { return "TFRAttrType"; } }; diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index d41437a3fe796c..a73fc10ae083c4 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -1,5 +1,5 @@ load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") -load("//tensorflow:tensorflow.bzl", "if_google", "tf_cc_binary", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "if_google", "tf_cc_binary") # Note: keep the following lines separate due to the way copybara works load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") @@ -19,7 +19,6 @@ package_group( name = "friends", packages = [ "//tensorflow/compiler/...", - "//tensorflow/core/runtime_fallback/...", "//tensorflow/core/tfrt/experimental/data/...", "//tensorflow/core/tfrt/graph_executor/...", "//tensorflow/core/tfrt/ifrt/...", @@ -127,164 +126,6 @@ cc_library( ], ) -cc_library( - name = "tf_ifrt_passes", - srcs = [ - "transforms/ifrt/rewrite_cluster_to_ifrt_call.cc", - "transforms/ifrt/tf_ifrt_passes.cc", - ], - hdrs = [ - "transforms/ifrt/rewrite_cluster_to_ifrt_call.h", - "transforms/ifrt/tf_ifrt_passes.h", - ], - #compatible_with = get_compatible_with_portable(), # copybara: comment - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:bridge_logger", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", - "//tensorflow/core:framework", - "//tensorflow/core/platform:random", - "@com_google_absl//absl/base", - "@com_google_absl//absl/log", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - ], -) - -cc_library( - name = "tf2hlo", - srcs = ["transforms/ifrt/tf2hlo.cc"], - hdrs = ["transforms/ifrt/tf2hlo.h"], - deps = [ - "//tensorflow/compiler/jit:xla_cpu_jit", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", - "//tensorflow/compiler/mlir/tf2xla/api/v2:legalize_tf", - "//tensorflow/compiler/tf2xla:layout_util", - 
"//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/core:core_cpu_base", - "//tensorflow/core:framework", - "//tensorflow/core:lib_headers_for_pybind", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", - "//tensorflow/core/tpu/kernels:tpu_compile_op_support", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:statusor", - "@local_xla//xla:shape_util", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:client_library", - "@local_xla//xla/python/ifrt", - "@local_xla//xla/stream_executor", - "@local_xla//xla/translate/hlo_to_mhlo:hlo_to_mlir_hlo", - ], -) - -tf_cc_test( - name = "tf2hlo_test", - srcs = [ - "transforms/ifrt/tf2hlo_test.cc", - ], - data = [ - "//tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata", - ], - tags = ["no_oss"], - deps = [ - ":tf2hlo", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/tf2xla:xla_helpers", - "//tensorflow/core:framework", - "//tensorflow/core:test", - "//tensorflow/core/platform:resource_loader", - "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@local_tsl//tsl/platform:statusor", - "@local_xla//xla/python/ifrt", - "@local_xla//xla/python/ifrt:test_util", - "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", - ], -) - -cc_library( - name = "ifrt_backend_compiler", - srcs = ["transforms/ifrt/ifrt_backend_compiler.cc"], - hdrs = ["transforms/ifrt/ifrt_backend_compiler.h"], - deps = [ - ":backend_compiler", - ":tf_ifrt_passes", - ":tpu_passes", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:visitor", - "//tensorflow/compiler/mlir/tf2xla/api/v2:cluster_tf", - "//tensorflow/core/tfrt/ifrt:ifrt_executable_registry", - "//tensorflow/core/tfrt/ifrt:ifrt_model_context", - "//tensorflow/core/tfrt/ifrt:ifrt_serving_executable", - "//tensorflow/core/tfrt/runtime", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - -tf_cc_test( - name = "ifrt_backend_compiler_test", - srcs = [ - "transforms/ifrt/ifrt_backend_compiler_test.cc", - ], - data = [ - "//tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata", - ], - tags = ["no_oss"], - deps = [ - ":ifrt_backend_compiler", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/core:test", - "//tensorflow/core/platform:resource_loader", - "//tensorflow/core/tfrt/graph_executor:graph_execution_options", - "//tensorflow/core/tfrt/ifrt:ifrt_model_context", - "//tensorflow/core/tfrt/runtime", - "//tensorflow/core/tfrt/saved_model:saved_model_testutil", - "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:AllPassesAndDialects", - "@llvm-project//mlir:IR", - 
"@llvm-project//mlir:Parser", - "@local_tsl//tsl/platform:statusor", - "@local_xla//xla/python/ifrt", - "@local_xla//xla/python/ifrt:test_util", - "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", - "@tf_runtime//:hostcontext", - ], -) - cc_library( name = "corert_converter", srcs = [ @@ -628,7 +469,7 @@ tf_proto_library( cc_library( name = "passes", visibility = [ - ":__subpackages__", + "//visibility:private", # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private. ], deps = [ "//tensorflow/compiler/mlir/tfrt:tf_to_tfrt", @@ -656,7 +497,6 @@ cc_library( ":test_cost_analysis_pass", ":test_opkernels", ":test_tensor_array_side_effect_analysis", - ":tf_ifrt_passes", ":tf_to_tfrt", ":tpu_passes", ":transforms/gpu_passes", @@ -671,6 +511,7 @@ cc_library( "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_sync_opdefs", "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:tf_ifrt_passes", "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:passes", "//tensorflow/core:tensorflow", "@llvm-project//mlir:AllPassesAndDialects", @@ -679,8 +520,6 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@local_xla//xla/mlir_hlo", - "@local_xla//xla/mlir_hlo:gml_st", - "@local_xla//xla/mlir_hlo:gml_st_passes", "@tf_runtime//:init_tfrt_dialects", "@tf_runtime//:print_stream_pass", ], diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD index e72c58bdd6b846..4b2b0576430bd1 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD @@ -70,7 +70,6 @@ td_library( ], includes = ["."], visibility = [ - # copybara:uncomment "//learning/brain/tfrt/mlir:__subpackages__", # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", ], deps = [ diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h index 0fb568b44dc8c9..644de2618d691c 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h @@ -37,6 +37,7 @@ class FutureType : public mlir::Type::TypeBase { public: using Base::Base; + static constexpr mlir::StringLiteral name = "mlrt.compiler.future"; }; // The MLIR type represents a C++ mlrt::Promise. @@ -44,6 +45,7 @@ class PromiseType : public mlir::Type::TypeBase { public: using Base::Base; + static constexpr mlir::StringLiteral name = "mlrt.compiler.promise"; }; // The MLIR type represents a C++ mlrt::AsyncHandle. 
@@ -51,6 +53,7 @@ class AsyncHandleType : public mlir::Type::TypeBase { public: using Base::Base; + static constexpr mlir::StringLiteral name = "mlrt.compiler.async_handle"; }; } // namespace compiler diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h index da91450aa19fc1..a542373eeccf6a 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h @@ -41,6 +41,7 @@ class TFTensorType : public mlir::Type::TypeBase { public: using Base::Base; + static constexpr mlir::StringLiteral name = "tensorflow.tf_mlrt.tf_tensor"; }; // The MLIR type represents a tensorflow::Device* @@ -48,6 +49,7 @@ class TFDeviceType : public mlir::Type::TypeBase { public: using Base::Base; + static constexpr mlir::StringLiteral name = "tensorflow.tf_mlirt.tf_device"; }; } // namespace tf_mlrt diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h index 0d2e941b5cfb0d..24fa464ff6ed31 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h @@ -40,6 +40,7 @@ class FallbackDialect : public Dialect { class TFTensorType : public Type::TypeBase { public: using Base::Base; + static constexpr StringLiteral name = "tfrt.tf_tensor"; }; // The MLIR type represents a tensorflow::Allocator. @@ -47,6 +48,7 @@ class TFAllocatorType : public Type::TypeBase { public: using Base::Base; + static constexpr StringLiteral name = "tfrt.tf_allocator"; }; } // namespace fallback diff --git a/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD index 53a5a8489895a9..1065a5fc1a682a 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD @@ -1,4 +1,6 @@ -load("@tf_runtime//tools:mlir_to_bef.bzl", "glob_tfrt_lit_tests") +load("//tensorflow:tensorflow.bzl", "tf_cc_shared_test") +load("@tf_runtime//tools:mlir_to_bef.bzl", "glob_tfrt_lit_tests", "mlir_to_bef") +# copybara:uncomment load("//third_party/tf_runtime_google/cpp_tests:gen_tests.bzl", "tfrt_cc_test_and_strict_benchmark") # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) @@ -9,6 +11,7 @@ filegroup( srcs = [ "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", "//tensorflow/core/runtime_fallback:tf_bef_executor", + "//tensorflow/core/runtime_fallback/util:fallback_test_util", "@llvm-project//llvm:FileCheck", "@llvm-project//llvm:not", "@llvm-project//mlir:run_lit.sh", @@ -23,6 +26,9 @@ filegroup( # # copybara:uncomment driver = "//tensorflow/compiler/mlir:run_lit.sh", # exclude = [ # "compile.benchmark.large.mlir", +# "batch_function_fallback.mlir", +# "create_op.mlir", +# "custom_thread_pool.mlir", # ], # # copybara:uncomment flaky = ["compile.error.mlir"], # size_override = { @@ -47,3 +53,91 @@ filegroup( # tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", # ) # copybara:uncomment_end + +mlir_to_bef( + name = "batch_function_fallback.mlir", + tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", +) + +mlir_to_bef( + name = "create_op.mlir", + tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", +) + +mlir_to_bef( + name = "custom_thread_pool.mlir", + tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", +) + +# copybara:uncomment_begin(internal benchmarking) +# # C++ benchmarks 
for batch function runtime fallback. +# tfrt_cc_test_and_strict_benchmark( +# name = "batch_function_fallback_benchmark_test", +# srcs = ["batch_function_fallback_benchmark_test.cc"], +# data = ["batch_function_fallback.mlir.bef"], +# enable_xprof = True, +# includes = ["third_party/tf_runtime/include"], +# owners = ["tf-runtime-testing"], +# tags = [ +# "need_main", +# "no_gpu", +# ], +# deps = [ +# "//base", +# "//devtools/build/runtime:get_runfiles_dir", +# "@com_google_absl//absl/log:check", +# "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", +# "//tensorflow/core/platform:env", +# "//tensorflow/core/platform:resource_loader", +# "//tensorflow/core/platform:status", +# "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_op_handler", +# "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_tensor", +# "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", +# "//tensorflow/core/runtime_fallback/util:fallback_test_util", +# "//tensorflow/core/runtime_fallback/util:tensor_util", +# "//tensorflow/core/tfrt/utils:fallback_tensor", +# "@eigen_archive//:eigen3", +# "@tf_runtime//:bef", +# "@tf_runtime//:befexecutor", +# "@tf_runtime//:core_runtime_alwayslink", +# "@tf_runtime//:hostcontext_alwayslink", +# "@tf_runtime//:mlirtobef", +# "@tf_runtime//:support", +# "@tf_runtime//:tensor", +# "@tf_runtime//backends/cpu:core_runtime_alwayslink", +# "@tf_runtime//backends/cpu:test_ops_alwayslink", +# ], +# ) +# copybara:uncomment_end + +tf_cc_shared_test( + name = "kernel_fallback_compat_test", + srcs = ["kernel_fallback_compat_test.cc"], + data = [ + "create_op.mlir.bef", + "custom_thread_pool.mlir.bef", + ], + tags = ["no_oss"], + deps = [ + "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", + "//tensorflow/core:all_kernels", + "//tensorflow/core:lib", + "//tensorflow/core/platform:resource_loader", + "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_compat_request_state", + "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", + "//tensorflow/core/runtime_fallback/util:fallback_test_util", + "//tensorflow/core/tfrt/fallback:op_kernel_runner", + "//tensorflow/core/tfrt/runtime", + "//tensorflow/core/tfrt/utils:thread_pool", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@tf_runtime//:bef", + "@tf_runtime//:befexecutor", + "@tf_runtime//:core_runtime", + "@tf_runtime//:hostcontext", + "@tf_runtime//:init_tfrt_dialects", + "@tf_runtime//:support", + "@tf_runtime//:tracing", + ], +) diff --git a/tensorflow/core/runtime_fallback/test/testdata/batch_function_fallback.mlir b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/batch_function_fallback.mlir similarity index 100% rename from tensorflow/core/runtime_fallback/test/testdata/batch_function_fallback.mlir rename to tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/batch_function_fallback.mlir diff --git a/tensorflow/core/runtime_fallback/test/batch_function_fallback_benchmark_test.cc b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/batch_function_fallback_benchmark_test.cc similarity index 87% rename from tensorflow/core/runtime_fallback/test/batch_function_fallback_benchmark_test.cc rename to tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/batch_function_fallback_benchmark_test.cc index 11bc0b6ecbf4f5..1d9d8f1e488984 100644 --- a/tensorflow/core/runtime_fallback/test/batch_function_fallback_benchmark_test.cc +++ 
b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/batch_function_fallback_benchmark_test.cc @@ -12,41 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include #include #include #include "base/logging.h" -#include "devtools/build/runtime/get_runfiles_dir.h" #include "testing/base/public/benchmark.h" -#include #include +#include "absl/log/check.h" #include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive -#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.h" -#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_tensor.h" #include "tensorflow/core/runtime_fallback/util/fallback_test_util.h" -#include "tensorflow/core/runtime_fallback/util/tensor_util.h" #include "tensorflow/core/tfrt/utils/fallback_tensor.h" #include "tfrt/bef/bef_buffer.h" // from @tf_runtime #include "tfrt/bef_executor/bef_file.h" // from @tf_runtime #include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime -#include "tfrt/core_runtime/tensor_handle.h" // from @tf_runtime +#include "tfrt/host_context/async_value.h" // from @tf_runtime +#include "tfrt/host_context/async_value_ref.h" // from @tf_runtime #include "tfrt/host_context/chain.h" // from @tf_runtime #include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime #include "tfrt/host_context/execution_context.h" // from @tf_runtime #include "tfrt/host_context/function.h" // from @tf_runtime +#include "tfrt/host_context/host_allocator.h" // from @tf_runtime #include "tfrt/host_context/host_context.h" // from @tf_runtime -#include "tfrt/support/aligned_buffer.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime +#include "tfrt/support/forward_decls.h" // from @tf_runtime #include "tfrt/support/rc_array.h" // from @tf_runtime #include "tfrt/tensor/dense_host_tensor.h" // from @tf_runtime -#include "tfrt/tensor/tensor_metadata.h" // from @tf_runtime +#include "tfrt/tensor/tensor.h" // from @tf_runtime namespace tensorflow { -namespace tfd { namespace { // Creates a BEF file with a program that runs @@ -55,11 +52,11 @@ namespace { std::pair> CreateBefFile( tfrt::HostContext* host) { std::string file_path = GetDataDependencyFilepath( - "tensorflow/core/runtime_fallback/test/testdata/" + "tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/" "batch_function_fallback.mlir.bef"); std::string data; - TF_CHECK_OK(ReadFileToString(Env::Default(), file_path, &data)); + CHECK_OK(ReadFileToString(Env::Default(), file_path, &data)); tfrt::BefBuffer bef_buffer(data.begin(), data.end()); @@ -109,7 +106,7 @@ TEST(BatchFunctionTest, Basic) { auto arguments = CreateTestArguments(func, host); tfrt::ResourceContext resource_ctx; - auto exec_ctx = CreateFallbackTestExecutionContext(host, &resource_ctx); + auto exec_ctx = tfd::CreateFallbackTestExecutionContext(host, &resource_ctx); std::vector> results; results.resize(func->result_types().size()); @@ -141,7 +138,7 @@ void BM_BatchFunctionFallbackWithLargeAttributesAndManyInputsOutputs( auto arguments = CreateTestArguments(func, host); tfrt::ResourceContext resource_ctx; - auto exec_ctx = CreateFallbackTestExecutionContext(host, &resource_ctx); + auto exec_ctx = 
tfd::CreateFallbackTestExecutionContext(host, &resource_ctx); std::vector> results; results.resize(func->result_types().size()); @@ -157,5 +154,4 @@ void BM_BatchFunctionFallbackWithLargeAttributesAndManyInputsOutputs( BENCHMARK(BM_BatchFunctionFallbackWithLargeAttributesAndManyInputsOutputs); } // namespace -} // namespace tfd } // namespace tensorflow diff --git a/tensorflow/core/runtime_fallback/test/testdata/create_op.mlir b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/create_op.mlir similarity index 100% rename from tensorflow/core/runtime_fallback/test/testdata/create_op.mlir rename to tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/create_op.mlir diff --git a/tensorflow/core/runtime_fallback/test/testdata/custom_thread_pool.mlir b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/custom_thread_pool.mlir similarity index 100% rename from tensorflow/core/runtime_fallback/test/testdata/custom_thread_pool.mlir rename to tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/custom_thread_pool.mlir diff --git a/tensorflow/core/runtime_fallback/test/kernel_fallback_compat_test.cc b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/kernel_fallback_compat_test.cc similarity index 87% rename from tensorflow/core/runtime_fallback/test/kernel_fallback_compat_test.cc rename to tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/kernel_fallback_compat_test.cc index 75fae5c26e71ac..7b1a51f4fc664a 100644 --- a/tensorflow/core/runtime_fallback/test/kernel_fallback_compat_test.cc +++ b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/kernel_fallback_compat_test.cc @@ -16,12 +16,12 @@ limitations under the License. #include #include -#include #include -#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" +#include "absl/log/check.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/resource_loader.h" -#include "tensorflow/core/platform/status.h" #include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h" #include "tensorflow/core/runtime_fallback/util/fallback_test_util.h" #include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" @@ -30,14 +30,15 @@ limitations under the License. 
#include "tfrt/bef/bef_buffer.h" // from @tf_runtime #include "tfrt/bef_executor/bef_file.h" // from @tf_runtime #include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime +#include "tfrt/host_context/async_value.h" // from @tf_runtime #include "tfrt/host_context/chain.h" // from @tf_runtime #include "tfrt/host_context/function.h" // from @tf_runtime #include "tfrt/host_context/host_context.h" // from @tf_runtime -#include "tfrt/init_tfrt_dialects.h" // from @tf_runtime +#include "tfrt/host_context/resource_context.h" // from @tf_runtime +#include "tfrt/support/ref_count.h" // from @tf_runtime #include "tfrt/tracing/tracing.h" // from @tf_runtime namespace tensorflow { -namespace tfd { namespace { // Creates a BEF file with a program that runs tfrt_fallback.batch_function with @@ -48,9 +49,9 @@ namespace { std::pair> CreateBefFile( absl::string_view file_name, tfrt::HostContext* host) { std::string file_path = GetDataDependencyFilepath(absl::StrCat( - "tensorflow/core/runtime_fallback/test/testdata/", file_name)); + "tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/", file_name)); std::string data; - TF_CHECK_OK(ReadFileToString(Env::Default(), file_path, &data)); + CHECK_OK(ReadFileToString(Env::Default(), file_path, &data)); tfrt::BefBuffer bef_buffer(data.begin(), data.end()); @@ -69,7 +70,7 @@ TEST(KernelFallbackCompatTest, CreateOp) { auto& bef_file = pair.second; tfrt::ResourceContext resource_ctx; - auto exec_ctx = CreateFallbackTestExecutionContext(host, &resource_ctx); + auto exec_ctx = tfd::CreateFallbackTestExecutionContext(host, &resource_ctx); auto chain = tfrt::GetReadyChain(); @@ -86,7 +87,7 @@ TEST(KernelFallbackCompatTest, CreateOp) { auto* fallback_request_state = exec_ctx.request_ctx() - ->GetDataIfExists(); + ->GetDataIfExists(); ASSERT_TRUE(fallback_request_state != nullptr); @@ -120,8 +121,8 @@ TEST(KernelFallbackCompatTest, CustomThreadPool) { tensorflow::tfrt_stub::TfThreadPool thread_pool(/*name=*/"test", /*num_threads=*/1); - auto exec_ctx = - CreateFallbackTestExecutionContext(host, &resource_ctx, &thread_pool); + auto exec_ctx = tfd::CreateFallbackTestExecutionContext(host, &resource_ctx, + &thread_pool); auto chain = tfrt::GetReadyChain(); @@ -146,5 +147,4 @@ TEST(KernelFallbackCompatTest, CustomThreadPool) { } } // namespace -} // namespace tfd } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc index 1ae3e8f1c54d31..a07558bac45f77 100644 --- a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc +++ b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc @@ -33,8 +33,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h" #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h" #include "tensorflow/compiler/mlir/tfrt/transforms/tpu_passes.h" -#include "xla/mlir_hlo/gml_st/IR/gml_st_ops.h" -#include "xla/mlir_hlo/gml_st/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" #include "tfrt/init_tfrt_dialects.h" // from @tf_runtime @@ -46,7 +44,6 @@ int main(int argc, char **argv) { mlir::registerTensorFlowPasses(); - mlir::gml_st::registerGmlStPasses(); tensorflow::mlrt_compiler::RegisterMlrtPasses(); tensorflow::ifrt_serving::RegisterTfIfrtPasses(); @@ -54,7 +51,6 @@ int main(int argc, char **argv) { mlir::DialectRegistry registry; mlir::registerAllDialects(registry); mlir::RegisterAllTensorFlowDialects(registry); - registry.insert(); registry.insert(); registry.insert(); registry.insert(); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD new file mode 100644 index 00000000000000..ec36fb683bc897 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD @@ -0,0 +1,179 @@ +load("//tensorflow:tensorflow.bzl", "if_google", "tf_cc_test") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [":friends"], + licenses = ["notice"], +) + +package_group( + name = "friends", + packages = [ + "//tensorflow/compiler/mlir/tfrt/...", + "//tensorflow/core/tfrt/ifrt/...", + "//tensorflow/core/tfrt/saved_model/tests/...", + ] + if_google([ + "//learning/brain/tfrt/cpp_tests/...", + # Allow visibility from the mlir language server. + "//learning/brain/mlir/mlir_lsp_server/...", + ]), +) + +cc_library( + name = "tf_ifrt_passes", + srcs = [ + "rewrite_cluster_to_ifrt_call.cc", + "tf_ifrt_passes.cc", + ], + hdrs = [ + "rewrite_cluster_to_ifrt_call.h", + "tf_ifrt_passes.h", + ], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:bridge_logger", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/core:framework", + "//tensorflow/core/platform:random", + "@com_google_absl//absl/base", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "tf2hlo", + srcs = ["tf2hlo.cc"], + hdrs = ["tf2hlo.h"], + deps = [ + "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", + "//tensorflow/compiler/mlir/tf2xla/api/v2:legalize_tf", + "//tensorflow/compiler/tf2xla:layout_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:lib_headers_for_pybind", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + 
"@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla:shape_util", + "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/client:client_library", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/stream_executor", + "@local_xla//xla/translate/hlo_to_mhlo:hlo_to_mlir_hlo", + ], +) + +tf_cc_test( + name = "tf2hlo_test", + srcs = [ + "tf2hlo_test.cc", + ], + data = [ + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata", + ], + tags = ["no_oss"], + deps = [ + ":tf2hlo", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/python/ifrt:test_util", + "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", + ], +) + +cc_library( + name = "ifrt_backend_compiler", + srcs = ["ifrt_backend_compiler.cc"], + hdrs = ["ifrt_backend_compiler.h"], + deps = [ + ":tf_ifrt_passes", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:visitor", + "//tensorflow/compiler/mlir/tf2xla/api/v2:cluster_tf", + "//tensorflow/compiler/mlir/tfrt:backend_compiler", + "//tensorflow/compiler/mlir/tfrt:tpu_passes", + "//tensorflow/core/tfrt/ifrt:ifrt_executable_registry", + "//tensorflow/core/tfrt/ifrt:ifrt_model_context", + "//tensorflow/core/tfrt/ifrt:ifrt_serving_executable", + "//tensorflow/core/tfrt/runtime", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +tf_cc_test( + name = "ifrt_backend_compiler_test", + srcs = [ + "ifrt_backend_compiler_test.cc", + ], + data = [ + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata", + ], + tags = ["no_oss"], + deps = [ + ":ifrt_backend_compiler", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", + "//tensorflow/core/tfrt/graph_executor:graph_execution_options", + "//tensorflow/core/tfrt/ifrt:ifrt_model_context", + "//tensorflow/core/tfrt/runtime", + "//tensorflow/core/tfrt/saved_model:saved_model_testutil", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/python/ifrt:test_util", + "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", + "@tf_runtime//:hostcontext", + ], +) diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc index 16b1f0b7776160..978ffd25667b4c 100644 --- 
a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc @@ -144,9 +144,6 @@ absl::Status IfrtBackendCompiler::CompileTensorflow( tensorflow::DumpMlirOpToFile("ifrt_tpu_bct_conversion_before", module); } - // TODO(b/305734600): conditionally running backward compat pass on host with - // tpu only. - // // Run backward compat pass so that we can use bridge to do clustering. auto backward_compat_result = tensorflow::RunTPUBackwardCompatConversion(module, {}); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/tf2hlo_1in1out.mlir b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/tf2hlo_1in1out.mlir deleted file mode 100644 index 8bd488bae251f2..00000000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/tf2hlo_1in1out.mlir +++ /dev/null @@ -1,5 +0,0 @@ -module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { - func.func @main(%arg0: tensor<1x3xi32>) -> (tensor<1x3xi32>) { - func.return %arg0: tensor<1x3xi32> - } -} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/tf2hlo_tuple.mlir b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/tf2hlo_tuple.mlir new file mode 100644 index 00000000000000..f1eb5659fb97b8 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/tf2hlo_tuple.mlir @@ -0,0 +1,6 @@ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main(%arg0: tensor<1x3xf32>, %arg1: tensor<3x1xf32>) -> (tensor<1x1xf32>, tensor<1x3xf32>) { + %0 = "tf.MatMul"(%arg0, %arg1): (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> + func.return %0, %arg0: tensor<1x1xf32>, tensor<1x3xf32> + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc index bf661ab5be5630..246c920d64b964 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc @@ -21,10 +21,13 @@ limitations under the License. #include "absl/log/check.h" #include "absl/log/log.h" +#include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project @@ -54,6 +57,8 @@ limitations under the License. namespace tensorflow { namespace ifrt_serving { +static constexpr absl::string_view kEntryFuncName = "main"; + absl::StatusOr> CompileTfToHlo( mlir::ModuleOp module, absl::Span inputs, absl::string_view entry_function_name, xla::ifrt::Compiler* ifrt_compiler, @@ -89,7 +94,21 @@ absl::StatusOr> CompileTfToHlo( // supported. metadata_arg1->set_kind(tpu::TPUCompileMetadataProto::Arg::PARAMETER); } - metadata.add_retvals(); + + auto entry_fn = module.lookupSymbol(kEntryFuncName); + if (!entry_fn) { + return absl::InternalError("Could not find entry function in MLIR Module."); + } + + if (inputs.size() != entry_fn.getNumArguments()) { + return absl::InternalError( + absl::StrCat("Number of inputs mismatched! 
Expect", + entry_fn.getNumArguments(), " got", inputs.size())); + } + + for (int i = 0; i < entry_fn.getNumResults(); i++) { + metadata.add_retvals(); + } bool use_tuple_args = false; std::vector arg_core_mapping; diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc index 8ad906ece7a2f4..ff2b4cebfb2530 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc @@ -72,12 +72,13 @@ TEST(Tf2HloTest, Basic) { TF_ASSERT_OK(result.status()); } -TEST(Tf2HloTest, 1in1out) { +// Multiple input and multiple out. +TEST(Tf2HloTest, Tuple) { // Create test input module constexpr absl::string_view kDataDirectory = "tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata"; std::string mlir_module_path = tensorflow::GetDataDependencyFilepath( - absl::StrCat(kDataDirectory, "/tf2hlo_1in1out.mlir")); + absl::StrCat(kDataDirectory, "/tf2hlo_tuple.mlir")); mlir::DialectRegistry registry; mlir::registerAllDialects(registry); @@ -95,8 +96,10 @@ TEST(Tf2HloTest, 1in1out) { xla::ifrt::test_util::GetClient()); std::vector tensors; - tensorflow::Tensor x(DT_INT32, tensorflow::TensorShape({1, 3})); + tensorflow::Tensor x(DT_FLOAT, tensorflow::TensorShape({1, 3})); + tensorflow::Tensor y(DT_FLOAT, tensorflow::TensorShape({3, 1})); tensors.push_back(x); + tensors.push_back(y); auto result = CompileTfToHlo(mlir_module.get(), tensors, "main", client->GetDefaultCompiler(), tensorflow::IdentityShapeRepresentationFn()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD index 03558438ac6f6b..90ab3af857c542 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD @@ -1,7 +1,6 @@ package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - # copybara:uncomment "//learning/brain/experimental/tfrt:__subpackages__", # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", "//tensorflow/compiler/mlir/tfrt:__subpackages__", "//tensorflow/core/tfrt:__subpackages__", diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc index 6e04fe1c1e23a1..6dc48e4d6d137f 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc @@ -68,7 +68,7 @@ StatusOr ConvertTfMlirToBytecode( TF_RETURN_IF_ERROR( ExportFunctionDefs(*copy, [flib_def](FunctionDef function_def) { VLOG(1) << "Exporting MLIR function as function_def: " - << function_def.DebugString(); + << function_def; // The TF MLIR compiler may change the function name. 
Then we // need to retrieve the original name from the diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc index 8c85f9f80ac912..1953ddd3d93997 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc @@ -464,6 +464,8 @@ absl::StatusOr EmitExecutable( return status; } + buffer.shrink_to_fit(); + return buffer; } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index e6ce181074de7f..d391f35e9adf77 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -142,7 +142,6 @@ tf_cc_binary( "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:MlirOptLib", "@local_xla//xla/mlir_hlo:all_passes", - "@local_xla//xla/mlir_hlo:gml_st", "@local_xla//xla/mlir_hlo:hlo_dialect_registration", "@stablehlo//:register", ], diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h index c1a6daff83008d..e64ef8e2900f47 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -42,12 +42,15 @@ class OpKernelContextType : public Type::TypeBase { public: using Base::Base; + static constexpr StringLiteral name = + "kernel_gen.tf_framework.op_kernel_context"; }; class JITCallableType : public Type::TypeBase { public: using Base::Base; + static constexpr StringLiteral name = "kernel_gen.tf_framework.jit_callable"; }; absl::StatusCode ConvertAttrToEnumValue(ErrorCode error_code); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_abi_knowledge.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_abi_knowledge.mlir index 8619344681beac..47b5a122ef0dd2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_abi_knowledge.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_abi_knowledge.mlir @@ -25,44 +25,44 @@ module attributes {gpu.container_module} { // CHECK-LABEL: gpu.module @abs_kernel gpu.module @abs_kernel { // CHECK-LABEL: llvm.func @abs_kernel - // ABI-SAME: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr {llvm.align = 16 : index}, - // ABI-SAME: %[[ARG2:.*]]: i64, %[[ARG3:.*]]: i64, %[[ARG4:.*]]: i64, %[[ARG5:.*]]: !llvm.ptr, %[[ARG6:.*]]: !llvm.ptr {llvm.align = 16 : index, llvm.noalias}, + // ABI-SAME: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr {llvm.align = 16 : index}, + // ABI-SAME: %[[ARG2:.*]]: i64, %[[ARG3:.*]]: i64, %[[ARG4:.*]]: i64, %[[ARG5:.*]]: !llvm.ptr, %[[ARG6:.*]]: !llvm.ptr {llvm.align = 16 : index, llvm.noalias}, // ABI-SAME: %[[ARG7:.*]]: i64, %[[ARG8:.*]]: i64, %[[ARG9:.*]]: i64 - // SHAPE-SAME: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: i64, %[[ARG3:.*]]: i64, %[[ARG4:.*]]: i64, %[[ARG5:.*]]: !llvm.ptr, %[[ARG6:.*]]: !llvm.ptr, %[[ARG7:.*]]: i64, %[[ARG8:.*]]: i64, %[[ARG9:.*]]: i64 - llvm.func @abs_kernel(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64) attributes {gpu.kernel} { + // SHAPE-SAME: %[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: i64, %[[ARG3:.*]]: i64, %[[ARG4:.*]]: i64, %[[ARG5:.*]]: !llvm.ptr, %[[ARG6:.*]]: !llvm.ptr, %[[ARG7:.*]]: i64, %[[ARG8:.*]]: i64, %[[ARG9:.*]]: i64 + llvm.func @abs_kernel(%arg0: !llvm.ptr, 
%arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64) attributes {gpu.kernel} { // ABI: %[[ZERO:.*]] = llvm.mlir.constant(0 : index) // ABI: %[[ONE:.*]] = llvm.mlir.constant(1 : index) // CHECK: llvm.mlir.undef - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ARG1]] // SHAPE-NEXT: llvm.insertvalue %[[ARG0]] - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK-NEXT: llvm.insertvalue %[[ARG1]] - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ZERO]] // SHAPE-NEXT: llvm.insertvalue %[[ARG2]] - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK-NEXT: llvm.insertvalue %[[ARG3]] - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ONE]] // SHAPE-NEXT: llvm.insertvalue %[[ARG4]] - %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK-NEXT: llvm.mlir.undef - %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ARG6]] // SHAPE-NEXT: llvm.insertvalue %[[ARG5]] - %7 = llvm.insertvalue %arg5, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %7 = llvm.insertvalue %arg5, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK-NEXT: llvm.insertvalue %[[ARG6]] - %8 = llvm.insertvalue %arg6, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %8 = llvm.insertvalue %arg6, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ZERO]] // SHAPE-NEXT: llvm.insertvalue %[[ARG7]] - %9 = llvm.insertvalue %arg7, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %9 = llvm.insertvalue %arg7, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ARG8]] // SHAPE-NEXT: llvm.insertvalue %[[ARG3]] - %10 = llvm.insertvalue %arg8, %9[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %10 = llvm.insertvalue %arg8, %9[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // ABI-NEXT: llvm.insertvalue %[[ONE]] // SHAPE-NEXT: llvm.insertvalue %[[ARG4]] - %11 = llvm.insertvalue %arg9, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %11 = llvm.insertvalue %arg9, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> llvm.return // CHECK-NEXT: llvm.return } @@ -93,63 +93,63 @@ module attributes {gpu.container_module} { // ABI-SAME: {llvm.align = 16 : index} 
// ABI-SAME: {llvm.align = 16 : index} // ABI-SAME: {llvm.align = 16 : index, llvm.noalias} - llvm.func @AddV2_kernel(%arg0: i64, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: !llvm.ptr, %arg7: !llvm.ptr, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64) attributes {gpu.kernel} { + llvm.func @AddV2_kernel(%arg0: i64, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: !llvm.ptr, %arg7: !llvm.ptr, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64) attributes {gpu.kernel} { // ABI: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 // ABI: %[[C1:.*]] = llvm.mlir.constant(1 : index) : i64 - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg1, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg2, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg3, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg4, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg5, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP:.*]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[STR:.*]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.insertvalue %arg6, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %8 = llvm.insertvalue %arg7, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %9 = llvm.insertvalue %arg8, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %10 = llvm.insertvalue %arg9, %9[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %11 = llvm.insertvalue %arg10, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR1:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR1]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %13 = llvm.insertvalue %arg11, %12[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %14 = llvm.insertvalue %arg12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %15 = llvm.insertvalue %arg13, %14[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %16 = llvm.insertvalue %arg14, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %17 = llvm.insertvalue %arg15, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR2:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR2]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %1 = llvm.insertvalue %arg1, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %2 = llvm.insertvalue %arg2, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %3 = llvm.insertvalue %arg3, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x 
i64>)> + %4 = llvm.insertvalue %arg4, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %5 = llvm.insertvalue %arg5, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP:.*]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[STR:.*]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %7 = llvm.insertvalue %arg6, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %8 = llvm.insertvalue %arg7, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %9 = llvm.insertvalue %arg8, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %10 = llvm.insertvalue %arg9, %9[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %11 = llvm.insertvalue %arg10, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR1:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR1]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> + %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %13 = llvm.insertvalue %arg11, %12[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %14 = llvm.insertvalue %arg12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %15 = llvm.insertvalue %arg13, %14[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %16 = llvm.insertvalue %arg14, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %17 = llvm.insertvalue %arg15, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR2:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR2]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> llvm.return // CHECK-NEXT: llvm.return } @@ -181,80 +181,80 @@ module attributes {gpu.container_module} { // ABI-SAME: {llvm.align = 16 : index, llvm.noalias} // ABI-SAME: {llvm.align = 16 : index} // ABI-SAME: {llvm.align = 16 : index} - llvm.func @AddV2_kernel(%arg0: i64, %arg1: i64, %arg2: !llvm.ptr, %arg3: !llvm.ptr {llvm.align = 16 : index, llvm.noalias}, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: !llvm.ptr, %arg10: !llvm.ptr {llvm.align = 16 : index}, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: !llvm.ptr, %arg17: !llvm.ptr {llvm.align = 16 : index}, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: i64) attributes {gpu.kernel} { + llvm.func @AddV2_kernel(%arg0: i64, %arg1: i64, %arg2: !llvm.ptr, %arg3: !llvm.ptr {llvm.align = 16 : index, llvm.noalias}, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: !llvm.ptr, %arg10: !llvm.ptr {llvm.align = 16 : index}, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: !llvm.ptr, %arg17: !llvm.ptr {llvm.align = 16 : index}, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: i64) attributes {gpu.kernel} { // ABI: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 // ABI: %[[C1:.*]] = llvm.mlir.constant(1 : index) : i64 - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg2, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg3, %1[1] : !llvm.struct<(ptr, ptr, i64, 
array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg4, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg5, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg6, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP0:.*]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[STR0:.*]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP1:.*]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[STR1:.*]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.insertvalue %arg9, %8[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %10 = llvm.insertvalue %arg10, %9[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %11 = llvm.insertvalue %arg11, %10[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.insertvalue %arg12, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %13 = llvm.insertvalue %arg14, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %14 = llvm.insertvalue %arg13, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %15 = llvm.insertvalue %arg15, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x 
i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NOT: llvm.insertvalue %[[C1]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP0]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NOT: llvm.insertvalue %[[STR0]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE: llvm.insertvalue %[[SHP1]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NOT: llvm.insertvalue %[[STR1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %16 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %17 = llvm.insertvalue %arg16, %16[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %18 = llvm.insertvalue %arg17, %17[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %19 = llvm.insertvalue %arg18, %18[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %20 = llvm.insertvalue %arg19, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %21 = llvm.insertvalue %arg21, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %22 = llvm.insertvalue %arg20, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %23 = llvm.insertvalue %arg22, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // ABI-NOT: llvm.insertvalue %[[C1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, 
%{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP0]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NOT: llvm.insertvalue %[[STR0]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE: llvm.insertvalue %[[SHP1]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // SHAPE-NOT: llvm.insertvalue %[[STR1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg2, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg3, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg4, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg5, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg6, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP0:.*]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[STR0:.*]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP1:.*]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[STR1:.*]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.insertvalue %arg9, %8[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %10 = llvm.insertvalue %arg10, %9[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %11 = llvm.insertvalue 
%arg11, %10[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %12 = llvm.insertvalue %arg12, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %13 = llvm.insertvalue %arg14, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %14 = llvm.insertvalue %arg13, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %15 = llvm.insertvalue %arg15, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NOT: llvm.insertvalue %[[C1]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP0]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NOT: llvm.insertvalue %[[STR0]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE: llvm.insertvalue %[[SHP1]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NOT: llvm.insertvalue %[[STR1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %16 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %17 = llvm.insertvalue %arg16, %16[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %18 = llvm.insertvalue %arg17, %17[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %19 = llvm.insertvalue %arg18, %18[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %20 = llvm.insertvalue %arg19, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %21 = llvm.insertvalue %arg21, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %22 = llvm.insertvalue %arg20, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %23 = llvm.insertvalue %arg22, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, 
%{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // ABI-NOT: llvm.insertvalue %[[C1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP0]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NOT: llvm.insertvalue %[[STR0]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE: llvm.insertvalue %[[SHP1]], %{{.*}}[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // SHAPE-NOT: llvm.insertvalue %[[STR1]], %{{.*}}[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> llvm.return // CHECK: llvm.return } @@ -289,63 +289,63 @@ module attributes {gpu.container_module} { // ABI-SAME: {llvm.align = 16 : index, llvm.noalias} // ABI-SAME: {llvm.align = 16 : index} // ABI-SAME: {llvm.align = 16 : index} - llvm.func @AddV2_kernel(%arg0: i64, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: !llvm.ptr, %arg7: !llvm.ptr, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64) attributes {gpu.kernel} { + llvm.func @AddV2_kernel(%arg0: i64, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: !llvm.ptr, %arg7: !llvm.ptr, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64) attributes {gpu.kernel} { // ABI: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64 // ABI: %[[C1:.*]] = llvm.mlir.constant(1 : index) : i64 - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg1, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg2, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg3, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg4, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg5, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 
x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP:.*]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[STR:.*]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.insertvalue %arg6, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %8 = llvm.insertvalue %arg7, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %9 = llvm.insertvalue %arg8, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %10 = llvm.insertvalue %arg9, %9[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %11 = llvm.insertvalue %arg10, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR1:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR1]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NOT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %13 = llvm.insertvalue %arg11, %12[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %14 = llvm.insertvalue %arg12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %15 = llvm.insertvalue %arg13, %14[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %16 = llvm.insertvalue %arg14, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %17 = llvm.insertvalue %arg15, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR2:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[PTR2]], 
%{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // SHAPE-NOT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %1 = llvm.insertvalue %arg1, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %2 = llvm.insertvalue %arg2, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %3 = llvm.insertvalue %arg3, %2[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %4 = llvm.insertvalue %arg4, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %5 = llvm.insertvalue %arg5, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR0]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP:.*]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[STR:.*]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %7 = llvm.insertvalue %arg6, %6[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %8 = llvm.insertvalue %arg7, %7[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %9 = llvm.insertvalue %arg8, %8[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %10 = llvm.insertvalue %arg9, %9[3, 0] : !llvm.struct<(ptr, ptr, 
i64, array<1 x i64>, array<1 x i64>)> + %11 = llvm.insertvalue %arg10, %10[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR1:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR1]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NOT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %12 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %13 = llvm.insertvalue %arg11, %12[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %14 = llvm.insertvalue %arg12, %13[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %15 = llvm.insertvalue %arg13, %14[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %16 = llvm.insertvalue %arg14, %15[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %17 = llvm.insertvalue %arg15, %16[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR2:.*]], %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[PTR2]], %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C0]], %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ABI-NEXT: llvm.insertvalue %[[C1]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NEXT: llvm.insertvalue %[[SHP]], %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // SHAPE-NOT: llvm.insertvalue %[[STR]], %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> llvm.return // CHECK: llvm.return } diff --git 
a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc index 681896f2a235a7..178e899cb33a72 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc @@ -20,7 +20,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" -#include "xla/mlir_hlo/gml_st/IR/gml_st_ops.h" #include "xla/mlir_hlo/lhlo/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/register.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" @@ -37,8 +36,7 @@ int main(int argc, char **argv) { mlir::stablehlo::registerAllDialects(registry); mlir::RegisterAllTensorFlowDialects(registry); - registry.insert(); + registry.insert(); return failed( mlir::MlirOptMain(argc, argv, "MLIR HLO pass driver\n", registry)); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index 7cf5ef8522bb23..7c2e9d45d12db9 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -1,13 +1,13 @@ load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") -load( - "@local_tsl//tsl/platform/default:cuda_build_defs.bzl", - "if_cuda_is_configured", -) +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load( "@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured", ) -load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") +load( + "@local_tsl//tsl/platform/default:cuda_build_defs.bzl", + "if_cuda_is_configured", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -155,7 +155,6 @@ cc_library( "@local_xla//xla:debug_options_flags", "@local_xla//xla:xla_proto_cc", "@local_xla//xla/mlir_hlo", - "@local_xla//xla/mlir_hlo:gml_st", "@local_xla//xla/mlir_hlo:lhlo", "@local_xla//xla/mlir_hlo:mhlo_passes", "@local_xla//xla/mlir_hlo:type_conversion", @@ -218,7 +217,6 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:Transforms", - "@local_xla//xla/mlir_hlo:gml_st", "@local_xla//xla/mlir_hlo:lhlo", "@local_xla//xla/mlir_hlo:transforms_passes", ], diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc index 136b278e8c9dcf..b002effdfccf89 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "mlir/Dialect/Complex/IR/Complex.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/GPU/IR/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMTypes.h" // from @llvm-project #include "mlir/Dialect/Math/IR/Math.h" // from @llvm-project @@ -178,7 +179,7 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite( name_buffer.append("_blob"); Value module_blob = LLVM::createGlobalString(loc, rewriter, name_buffer.str(), binary_attr.getValue(), - LLVM::Linkage::Internal, true); + LLVM::Linkage::Internal); // Make sure the trailing zero is included in the constant. auto kernel_name = launch_op.getKernelName().getValue(); @@ -192,7 +193,7 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite( .toStringRef(kernel_name_global_name_buffer); auto kernel_name_global = LLVM::createGlobalString( loc, rewriter, kernel_name_global_name, kernel_name_buffer, - LLVM::Linkage::Internal, true); + LLVM::Linkage::Internal); // The TensorFlow OpKernelContext is the first argument of the surrounding // LLVMFunc. diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc index b1c909bb52364c..b3cb73b78baf20 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -61,7 +62,7 @@ Value CreateOrFindGlobalStringConstant(Location loc, StringRef global_name, ValueRange{c0, c0}); } return LLVM::createGlobalString(loc, *b, global_name, content, - LLVM::Linkage::Internal, true); + LLVM::Linkage::Internal); } } // namespace transforms diff --git a/tensorflow/compiler/mlir/tosa/BUILD b/tensorflow/compiler/mlir/tosa/BUILD index 9eca865c4a91fd..d53604011273d8 100644 --- a/tensorflow/compiler/mlir/tosa/BUILD +++ b/tensorflow/compiler/mlir/tosa/BUILD @@ -186,8 +186,8 @@ cc_library( "transforms/convert_metadata.cc", "transforms/convert_tfl_uint8.cc", "transforms/legalize_tfl.cc", + "transforms/legalize_tfl_stateful.cc", "transforms/lower_complex_types.cc", - "transforms/lower_global_tensors.cc", "transforms/retain_call_once_funcs.cc", "transforms/strip_metadata.cc", "transforms/strip_quant_types.cc", @@ -213,7 +213,6 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:FunctionInterfaces", "@llvm-project//mlir:IR", - "@llvm-project//mlir:MLProgramDialect", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:ReconcileUnrealizedCasts", diff --git a/tensorflow/compiler/mlir/tosa/tests/lower_global_tensors.mlir b/tensorflow/compiler/mlir/tosa/tests/lower_global_tensors.mlir deleted file mode 100644 index 5b8bd2cc3c09a2..00000000000000 --- a/tensorflow/compiler/mlir/tosa/tests/lower_global_tensors.mlir +++ /dev/null @@ -1,145 +0,0 @@ -// RUN: tf-opt --split-input-file --pass-pipeline='builtin.module(tflite-lower-global-tensors)' %s | FileCheck %s - -module { - // CHECK: ml_program.global private mutable @Variable(dense<1.000000e+00> : tensor<16x16xf32>) - // CHECK-LABEL: func.func @state 
- func.func @state(%arg0: tensor<16x16xf32>) -> () { - "tfl.call_once"() {session_init_function = "StateInit"} : () -> () - return - } - - func.func private @StateInit() { - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - %1 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> - "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () - return - } -} - -// ----- - -module { - // CHECK: ml_program.global private mutable @Variable(dense<1.000000e+00> : tensor<16x16xf32>) - - // CHECK-LABEL: func.func @assign - func.func @assign(%arg0: tensor<16x16xf32>) -> () { - "tfl.call_once"() {session_init_function = "AssignInit"} : () -> () - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - - // CHECK: ml_program.global_store @Variable = %arg0 - "tfl.assign_variable"(%0, %arg0) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () - return - } - - func.func private @AssignInit() { - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - %1 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> - "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () - return - } -} - -// ----- - -module { - // CHECK: ml_program.global private mutable @Variable(dense<1.000000e+00> : tensor<16x16xf32>) - - // CHECK-LABEL: func.func @read - func.func @read(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { - "tfl.call_once"() {session_init_function = "ReadInit"} : () -> () - - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - - // CHECK: %[[LOAD:.+]] = ml_program.global_load @Variable : tensor<16x16xf32> - %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> - return %1 : tensor<16x16xf32> - } - - func.func private @ReadInit() { - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - %1 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> - "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () - return - } -} - -// ----- - -module { - // CHECK: ml_program.global private mutable @Variable(dense<2.000000e+00> : tensor<16x16xf32>) - - // CHECK-LABEL: func.func @readAssign - func.func @readAssign(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { - "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> () - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - - // CHECK: %[[LOAD:.+]] = ml_program.global_load @Variable : tensor<16x16xf32> - %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> - - // CHECK: %[[ADD:.+]] = tfl.add %[[LOAD]], %arg0 - %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<16x16xf32> - - // CHECK: ml_program.global_store @Variable = %[[ADD]] - "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () - return %2 : tensor<16x16xf32> - } - func.func private @ReadAssignInit() { - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - %1 = "tfl.pseudo_const"() {value = dense<2.000000e+00> : tensor<16x16xf32>} : () -> tensor<16x16xf32> - "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, 
tensor<16x16xf32>) -> () - return - } -} - -// ----- - -module { - // CHECK: ml_program.global private mutable @Variable(dense<42> : tensor<2x3xi8>) - // CHECK-LABEL: func.func @readAssignQuant - func.func @readAssignQuant(%arg0: tensor<2x3x!quant.uniform>) -> (tensor<2x3x!quant.uniform>) { - "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> () - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - - // CHECK: %[[ADDR:.+]] = ml_program.global_load @Variable : tensor<2x3xi8> - // CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[ADDR]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform> - %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<2x3x!quant.uniform> - - // CHECK: %[[ADD:.+]] = tfl.add %[[CAST]], %arg0 {fused_activation_function = "NONE"} - %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<2x3x!quant.uniform> - - // CHECK: %[[CAST2:.+]] = builtin.unrealized_conversion_cast %[[ADD]] : tensor<2x3x!quant.uniform> to tensor<2x3xi8> - // CHECK: ml_program.global_store @Variable = %[[CAST2]] - "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform>) -> () - return %2 : tensor<2x3x!quant.uniform> - } - func.func private @ReadAssignInit() { - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - %1 = "tfl.pseudo_const"() {qtype = tensor<2x3x!quant.uniform>, value = dense<42> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> - "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform>) -> () - return - } -} - -// ----- - -module { - // CHECK-label: @nostate - func.func @nostate(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { - "tfl.call_once"() {session_init_function = "NoStateInit"} : () -> () - // CHECK: tfl.var_handle - %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> - - // CHECK: tfl.read_variable - %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> - - %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<16x16xf32> - - // CHECK: tfl.assign_variable - "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () - return %2 : tensor<16x16xf32> - } - func.func private @NoStateInit() { - return - } -} - diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir index 84e61d0de2f7cb..c6c7e649e971a6 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir @@ -271,18 +271,20 @@ func.func @test_conv3d_bias(%arg0: tensor<10x3x64x64x12xf32>, %arg1: tensor<16x2 // CHECK-LABEL: test_conv3d_qi8( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4x8x21x17x!quant.uniform> // CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3x3x17x34xf32>) -> tensor<1x4x8x11x34x!quant.uniform> -// CHECK-DAG: %[[VAL_2:.*]] = "tosa.const"() <{value = dense<0.0156862643> : tensor<1x1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<1.11982894> : tensor<1x1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<-4.000000e+00> : tensor<1x1x1x1x1xf32>}> -// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<34xf32>}> -// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>}> -// CHECK: %[[VAL_7:.*]] = tosa.cast 
%[[VAL_0]] -// CHECK: %[[VAL_8:.*]] = tosa.mul %[[VAL_7]], %[[VAL_2]] {shift = 0 : i8} -// CHECK: %[[VAL_9:.*]] = tosa.transpose %[[VAL_1]], %[[VAL_6]] -// CHECK: %[[VAL_10:.*]] = tosa.conv3d %[[VAL_8]], %[[VAL_9]], %[[VAL_5]] {dilation = array, pad = array, stride = array} -// CHECK: %[[VAL_11:.*]] = tosa.mul %[[VAL_10]], %[[VAL_3]] {shift = 0 : i8} -// CHECK: %[[VAL_12:.*]] = tosa.add %[[VAL_11]], %[[VAL_4]] -// CHECK: %[[VAL_13:.*]] = tosa.cast %[[VAL_12]] +// CHECK-DAG: %[[VAL_3:.*]] = "tosa.const"() <{value = dense<0.0156862643> : tensor<1x1x1x1x1xf32>} +// CHECK-DAG: %[[VAL_4:.*]] = "tosa.const"() <{value = dense<1.11982894> : tensor<1x1x1x1x1xf32>} +// CHECK-DAG: %[[VAL_5:.*]] = "tosa.const"() <{value = dense<-4> : tensor<1x1x1x1x1xi32>} +// CHECK-DAG: %[[VAL_6:.*]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<34xf32>} +// CHECK-DAG: %[[VAL_7:.*]] = "tosa.const"() <{value = dense<[4, 0, 1, 2, 3]> : tensor<5xi32>} +// CHECK: %[[VAL_8:.*]] = tosa.cast %[[VAL_0]] +// CHECK: %[[VAL_10:.*]] = tosa.mul %[[VAL_8]], %[[VAL_3]] {shift = 0 : i8} +// CHECK: %[[VAL_11:.*]] = tosa.transpose %[[VAL_1]], %[[VAL_7]] +// CHECK: %[[VAL_12:.*]] = tosa.conv3d %[[VAL_10]], %[[VAL_11]], %[[VAL_6]] {dilation = array, pad = array, stride = array} +// CHECK: %[[VAL_13:.*]] = tosa.mul %[[VAL_12]], %[[VAL_4]] {shift = 0 : i8} +// CHECK: %[[VAL_14:.*]] = tosa.cast %[[VAL_13]] +// CHECK: %[[VAL_15:.*]] = tosa.add %[[VAL_14]], %[[VAL_5]] +// CHECK: %[[VAL_16:.*]] = tosa.cast %[[VAL_15]] +// CHECK: return %[[VAL_16]] func.func @test_conv3d_qi8(%arg0: tensor<1x4x8x21x17x!quant.uniform>, %arg1: tensor<2x3x3x17x34xf32>) -> (tensor<1x4x8x11x34x!quant.uniform>) { %0 = "tfl.dequantize"(%arg0) : (tensor<1x4x8x21x17x!quant.uniform>) -> tensor<1x4x8x21x17xf32> %2 = "tfl.no_value"() {value} : () -> none @@ -1853,12 +1855,12 @@ func.func @test_one_hot(%arg0: tensor<4x4xi32>, %arg1: tensor, %arg2: tenso // ----- // CHECK-LABEL: test_fakequant_with_min_max_args -// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>}> -// CHECK-DAG: %[[VAR2:.*]] = tosa.mul %arg0, %[[VAR0]] {shift = 0 : i8} -// CHECK-DAG: %[[VAR3:.*]] = tosa.cast %[[VAR2]] -// CHECK-DAG: %[[VAR4:.*]] = tosa.cast %[[VAR3]] -// CHECK-DAG: %[[VAR5:.*]] = tosa.mul %[[VAR4]], %[[VAR1]] {shift = 0 : i8} +// CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() <{value = dense<6.10360876E-5> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.const"() <{value = dense<16383.75> : tensor<1x1x1xf32>} +// CHECK-DAG: %[[VAR3:.*]] = tosa.mul %arg0, %[[VAR2]] {shift = 0 : i8} +// CHECK-DAG: %[[VAR5:.*]] = tosa.cast %[[VAR3]] +// CHECK-DAG: %[[VAR6:.*]] = tosa.cast %[[VAR5]] +// CHECK-DAG: %[[VAR8:.*]] = tosa.mul %[[VAR6]], %[[VAR1]] {shift = 0 : i8} func.func @test_fakequant_with_min_max_args(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.quantize"(%arg0) {qtype = tensor<13x21x3x!quant.uniform>} : (tensor<13x21x3xf32>) -> tensor<*x!quant.uniform> %1 = "tfl.dequantize"(%0) : (tensor<*x!quant.uniform>) -> tensor<13x21x3xf32> @@ -2662,7 +2664,7 @@ func.func @test_reverse_fail(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> // CHECK-LABEL: test_tfl_custom // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x64x64x32xf32> -// CHECK: %[[VAL_0:.*]] = tosa.custom %[[ARG_0]] {config = "TFL", identifier = "MaxPoolingWithArgmax2D", implementation_attrs = "{{.*}}"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) +// 
CHECK: %[[VAL_0:.*]] = tosa.custom %[[ARG_0]] {domain_name = "TFL", implementation_attrs = "{{.*}}", operator_name = "MaxPoolingWithArgmax2D"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) func.func @test_tfl_custom(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { // custom op for "tfl.max_pooling_with_argmax_2d"(%arg0) {filter_h = 2 : i32, filter_w = 2 : i32, padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) %0, %1 = "tfl.custom"(%arg0) {custom_option = #tfl, custom_code = "MaxPoolingWithArgmax2D"} : (tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir new file mode 100644 index 00000000000000..e0f2d6b3ede707 --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir @@ -0,0 +1,84 @@ +// RUN: tf-opt --split-input-file --tfl-to-tosa-pipeline --verify-each %s | FileCheck %s +// RUN: tf-opt --split-input-file --tf-tfl-to-tosa-pipeline --verify-each %s | FileCheck %s + +// Operations for testing tfl-to-tosa-pipeline + +// ----- + +module attributes {tf_saved_model.semantics, tfl.description = "Test.", tfl.schema_version = 3 : i32} { + // CHECK: tosa.variable @var_x = dense<7.000000e+00> : tensor<1xf32> + // CHECK-LABEL: test_stateful_ops + // CHECK: tosa.variable.write @var_x, %arg0 : tensor<1xf32> + // CHECK: %[[VAL_0:.*]] = tosa.variable.read @var_x : tensor<1xf32> + // CHECK: return %[[VAL_0]] : tensor<1xf32> + func.func @test_stateful_ops(%arg0: tensor<1xf32> {tf_saved_model.index_path = ["placeholder_0"]}) + -> (tensor<1xf32> {tf_saved_model.index_path = ["output_0"]}) + attributes {tf_saved_model.exported_names = ["serving_default"]} { + "tfl.call_once"() {session_init_function = "InitializeX"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "var_x"} : () -> tensor + "tfl.assign_variable"(%0, %arg0) : (tensor, tensor<1xf32>) -> () + %1 = "tfl.read_variable"(%0) : (tensor) -> tensor<1xf32> + return %1 : tensor<1xf32> + } + + // initialize variable var_x to 7.0 + func.func private @InitializeX() { + %0 = "tfl.var_handle"() {container = "", shared_name = "var_x"} : () -> tensor + %1 = "tfl.pseudo_const"() {value = dense<7.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> + "tfl.assign_variable"(%0, %1) : (tensor, tensor<1xf32>) -> () + return + } +} + +// ----- + +module { + // CHECK: tosa.variable @Variable = dense<42> : tensor<2x3xi8> + // CHECK-LABEL: readAssignQuant + // CHECK: %[[VAL_0:.*]] = tosa.variable.read @Variable : tensor<2x3xi8> + // CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[VAL_0]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform> + // CHECK: %[[VAL_2:.*]] = tosa.rescale %[[VAL_1]] {double_round = true, input_zp = 2 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<2x3x!quant.uniform>) -> tensor<2x3xi32> + // CHECK: %[[VAL_3:.*]] = tosa.rescale %[[VAL_4:.*]] {double_round = true, input_zp = 2 : i32, multiplier = array, output_zp = 0 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<2x3x!quant.uniform>) -> tensor<2x3xi32> + // CHECK: %[[VAL_5:.*]] = tosa.add %[[VAL_2]], %[[VAL_3]] : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + // CHECK: %[[VAL_6:.*]] = tosa.rescale %[[VAL_5]] {double_round = true, input_zp = 0 : 
i32, multiplier = array, output_zp = 2 : i32, per_channel = false, scale32 = true, shift = array} : (tensor<2x3xi32>) -> tensor<2x3x!quant.uniform> + // CHECK: %[[VAL_7:.*]] = builtin.unrealized_conversion_cast %[[VAL_6]] : tensor<2x3x!quant.uniform> to tensor<2x3xi8> + // CHECK: tosa.variable.write @Variable, %[[VAL_7]] : tensor<2x3xi8> + // CHECK: return %[[VAL_6]] : tensor<2x3x!quant.uniform> + func.func @readAssignQuant(%arg0: tensor<2x3x!quant.uniform>) -> (tensor<2x3x!quant.uniform>) { + "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<2x3x!quant.uniform> + %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<2x3x!quant.uniform> + "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform>) -> () + return %2 : tensor<2x3x!quant.uniform> + } + func.func private @ReadAssignInit() { + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.pseudo_const"() {qtype = tensor<2x3x!quant.uniform>, value = dense<42> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> + "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform>) -> () + return + } +} + +// ----- + +module { + // CHECK-LABEL: @nostate + // CHECK: %[[VAL_0:.*]]: tensor<16x16xf32>) -> tensor<16x16xf32> { + // CHECK: %[[VAL_1:.*]] = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + // CHECK: %[[VAL_2:.*]] = "tfl.read_variable"(%[[VAL_1]]) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> + // CHECK: %[[VAL_3:.*]] = tosa.add %[[VAL_2]], %[[VAL_0]] : (tensor<16x16xf32>, tensor<16x16xf32>) -> tensor<16x16xf32> + // CHECK: "tfl.assign_variable"(%[[VAL_1]], %[[VAL_3]]) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + // CHECK: return %[[VAL_3]] : tensor<16x16xf32> + func.func @nostate(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { + "tfl.call_once"() {session_init_function = "NoStateInit"} : () -> () + %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> + %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<16x16xf32> + "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () + return %2 : tensor<16x16xf32> + } + func.func private @NoStateInit() { + return + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc b/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc index 2b31e3246fd598..81ea9f6393216c 100644 --- a/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/tosa/tf_tfl_passes.cc @@ -30,6 +30,9 @@ void createTFTFLtoTOSALegalizationPipeline( //---------------------------------------------------------------------------- // Prepare TFL module for conversion //---------------------------------------------------------------------------- + // For stateful ops + pm.addPass(createRetainCallOnceFuncsPass()); + // Inline all functions into main and then delete the functions themselves. 
pm.addPass(mlir::createInlinerPass()); @@ -52,6 +55,7 @@ void createTFTFLtoTOSALegalizationPipeline( if (opts.dequantize_tfl_softmax) { pm.addPass(mlir::tosa::createDequantizeTFLSoftmaxPass()); } + pm.addPass(mlir::tosa::createLegalizeTFLStatefulPass()); pm.addPass(mlir::tosa::createLegalizeTFTFLPass()); //---------------------------------------------------------------------------- diff --git a/tensorflow/compiler/mlir/tosa/tfl_passes.cc b/tensorflow/compiler/mlir/tosa/tfl_passes.cc index ff3c38e381e8ba..2eb98a4415f668 100644 --- a/tensorflow/compiler/mlir/tosa/tfl_passes.cc +++ b/tensorflow/compiler/mlir/tosa/tfl_passes.cc @@ -30,16 +30,14 @@ void createTFLtoTOSALegalizationPipeline( //---------------------------------------------------------------------------- // Prepare TFL module for conversion //---------------------------------------------------------------------------- - if (opts.target_compilation_backend) { - pm.addPass(createRetainCallOnceFuncsPass()); - } + pm.addPass(createRetainCallOnceFuncsPass()); + // Inline all functions into main and then delete the functions themselves. pm.addPass(mlir::createInlinerPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(createSymbolDCEPass()); if (opts.target_compilation_backend) { pm.nest().addPass(createConvertFunctionMetadataPass()); - pm.addPass(createLowerGlobalTensorsPass()); } // Add pass to decompose TFLite mixed quantization to non-quantized variants. @@ -59,6 +57,7 @@ void createTFLtoTOSALegalizationPipeline( if (opts.dequantize_tfl_softmax) { pm.addPass(mlir::tosa::createDequantizeTFLSoftmaxPass()); } + pm.addPass(mlir::tosa::createLegalizeTFLStatefulPass()); pm.addPass(mlir::tosa::createLegalizeTFLPass(opts.disabled_patterns, opts.enabled_patterns)); diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc index 7fed578c78f86c..b454dfecbaca98 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc @@ -3514,21 +3514,26 @@ std::optional convertQuantizeOp(PatternRewriter& rewriter, Operation* op, } ShapedType output_fp_type = output_type.clone(rewriter.getF32Type()); - - Value zp_val = - getTosaConstTensorSingleF32(rewriter, op, static_cast(zeropoint)); - - auto op1_mul_in = CreateOpAndInfer( + Value result = CreateOpAndInfer( rewriter, op->getLoc(), output_fp_type, input_value, getTosaConstTensorSingleF32(rewriter, op, static_cast(scale)), 0); - auto op2_add_op1 = CreateOpAndInfer( - rewriter, op->getLoc(), output_fp_type, op1_mul_in.getResult(), zp_val); + if (zeropoint != 0) { + // cast to i32 to add zeropoint + ShapedType output_i32_type = output_type.clone(rewriter.getI32Type()); + Value cast_i32 = CreateOpAndInfer(rewriter, op->getLoc(), + output_i32_type, result); + + Value zp_val = getTosaConstTensorSingleI32(rewriter, op, zeropoint); + + result = CreateOpAndInfer(rewriter, op->getLoc(), + output_i32_type, cast_i32, zp_val); + } - auto op3_cast_op2 = CreateOpAndInfer( - rewriter, op->getLoc(), output_type, op2_add_op1.getResult()); + Value final_result = CreateOpAndInfer(rewriter, op->getLoc(), + output_type, result); - return op3_cast_op2.getResult(); + return final_result; } // Lowers Dequantize to a sequence of TOSA dequantization ops. 
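// The convertQuantizeOp change above replaces the old float zero-point add (mul -> add -> cast) with a
// sequence that only materializes the zero point when it is nonzero, and adds it in 32-bit integer
// arithmetic: scale with a float tosa.mul, tosa.cast to i32, tosa.add an i32 zero-point constant, then
// tosa.cast to the quantized storage type. This matches the reordered mul/cast/add/cast CHECK lines in
// test_conv3d_qi8 above. The following is a hand-written, illustrative MLIR sketch of the nonzero
// zero-point case; the function name, shape, scale multiplier, and zero point (-4) are made up for the
// example and are not taken from this patch:
//
//   func.func @quantize_example(%input: tensor<8xf32>) -> tensor<8xi8> {
//     %scale  = "tosa.const"() <{value = dense<2.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
//     %zp     = "tosa.const"() <{value = dense<-4> : tensor<1xi32>}> : () -> tensor<1xi32>
//     %scaled = tosa.mul %input, %scale {shift = 0 : i8} : (tensor<8xf32>, tensor<1xf32>) -> tensor<8xf32>
//     %wide   = tosa.cast %scaled : (tensor<8xf32>) -> tensor<8xi32>
//     %biased = tosa.add %wide, %zp : (tensor<8xi32>, tensor<1xi32>) -> tensor<8xi32>
//     %out    = tosa.cast %biased : (tensor<8xi32>) -> tensor<8xi8>
//     return %out : tensor<8xi8>
//   }
//
// When the zero point is zero, the cast/add pair is skipped and the scaled value is cast directly to the
// quantized output type.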
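// The tosa-legalize-tfl-stateful pass registered in the pipelines above (and exercised by
// tfl-to-tosa-stateful.mlir) maps TFL resource-variable ops onto TOSA variable ops: the constant
// assigned inside the tfl.call_once session-init function becomes a module-level tosa.variable,
// tfl.assign_variable becomes tosa.variable.write, tfl.read_variable becomes tosa.variable.read, and
// the now-unused var_handle and call_once ops are erased. An illustrative hand-written sketch of the
// resulting IR, with an assumed variable name and initial value (not taken from this patch):
//
//   tosa.variable @acc = dense<0.000000e+00> : tensor<1xf32>
//   func.func @update(%arg0: tensor<1xf32>) -> tensor<1xf32> {
//     tosa.variable.write @acc, %arg0 : tensor<1xf32>
//     %0 = tosa.variable.read @acc : tensor<1xf32>
//     return %0 : tensor<1xf32>
//   }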
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl_stateful.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl_stateful.cc new file mode 100644 index 00000000000000..4028093f547a3a --- /dev/null +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl_stateful.cc @@ -0,0 +1,187 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Legalize TensorFlow Lite StatefulOps to TOSA + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h" + +#define PASS_NAME "tosa-legalize-tfl-stateful" + +namespace mlir { +namespace tosa { +namespace { + +#define GEN_PASS_DEF_TOSALEGALIZETFLSTATEFULPASS +#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" + +// Performs lowering tfl stateful operators to TOSA +class TosaLegalizeTFLStateful + : public impl::TosaLegalizeTFLStatefulPassBase { + public: + explicit TosaLegalizeTFLStateful() = default; + void runOnOperation() override; +}; + +void TosaLegalizeTFLStateful::runOnOperation() { + auto moduleOp = getOperation(); + mlir::OpBuilder builder(moduleOp.getBodyRegion()); + + DenseMap symNameToFunction; + for (auto func : moduleOp.getOps()) { + symNameToFunction[func.getSymName()] = func; + } + + llvm::SmallVector handleOps; + llvm::SmallVector assignOps; + llvm::SmallVector readOps; + SmallVector callOnceOps; + DenseMap symbolRefMap; + + for (auto it : symNameToFunction) { + auto func = std::get<1>(it); + // We also want to grab the list of operations to replace. + for (auto& op : func.getOps()) { + if (auto handle = dyn_cast(op)) + handleOps.push_back(handle); + if (auto assign = dyn_cast(op)) + assignOps.push_back(assign); + if (auto read = dyn_cast(op)) + readOps.push_back(read); + } + } + + for (auto func : moduleOp.getOps()) { + for (auto init : func.getOps()) { + callOnceOps.push_back(init); + } + } + + // Look through the initialization functions and find the assigned values + // for each handle, save out the constant value. + for (auto init : callOnceOps) { + auto findInitFunc = + symNameToFunction.find(init.getSessionInitFunctionAttr()); + if (findInitFunc == symNameToFunction.end()) { + init.emitError("unable to find initialization function: "); + continue; + } + func::FuncOp initFunc = std::get<1>(*findInitFunc); + for (auto assign : initFunc.getOps()) { + // 1. 
var_handle part + auto handle = dyn_cast( + assign.getResourceId().getDefiningOp()); + if (!handle) continue; + + // 2. pseudo_const part + DenseElementsAttr constant; + if (!matchPattern(assign.getValue(), m_Constant(&constant))) { + // Quantized types we can not use the m_Constant matcher. + if (auto constOp = dyn_cast( + assign.getValue().getDefiningOp())) { + constant = cast(constOp.getValue()); + } + } + if (!constant) continue; + + // Create TOSA VariableOps + auto name = handle.getSharedName(); + auto global = builder.create( + handle.getLoc(), name, constant.getType(), constant); + symbolRefMap[name] = global; + } + } + // TF::CallOnceOps are no longer needed as we have already extracted their + // state. + for (auto op : callOnceOps) op.erase(); + + // Replace the assign ops with a tosa store operation. + for (auto assign : assignOps) { + auto handle = dyn_cast( + assign.getResourceId().getDefiningOp()); + if (!handle) continue; + + Value value = assign.getValue(); + auto globalOpIt = symbolRefMap.find(handle.getSharedName()); + if (globalOpIt == symbolRefMap.end()) { + assign->emitError( + "unable to find corresponding TosaOp for op's VarHandle"); + continue; + } + auto globalOp = std::get<1>(*globalOpIt); + + builder.setInsertionPoint(assign); + if (globalOp.getType() != value.getType()) { + value = builder + .create(assign.getLoc(), + globalOp.getType(), value) + .getResult(0); + } + + builder.create( + assign.getLoc(), llvm::StringRef(globalOp.getName()), value); + assign.erase(); + } + + for (auto read : readOps) { + auto handle = + dyn_cast(read.getResourceId().getDefiningOp()); + if (!handle) continue; + + auto globalOpIt = symbolRefMap.find(handle.getSharedName()); + if (globalOpIt == symbolRefMap.end()) continue; + auto globalOp = std::get<1>(*globalOpIt); + + builder.setInsertionPoint(read); + + Value load = builder.create( + read.getLoc(), globalOp.getType(), llvm::StringRef(globalOp.getName())); + + if (read.getType() != load.getType()) { + load = builder + .create(read.getLoc(), + read.getType(), load) + .getResult(0); + } + read.getResult().replaceAllUsesWith(load); + read.erase(); + } + + for (auto handle : handleOps) { + if (handle.getResult().use_empty()) { + handle.erase(); + } + } +} + +} // namespace + +// Creates an instance of the TensorFlow Lite dialect LegalizeTFLStateful pass. +std::unique_ptr> createLegalizeTFLStatefulPass() { + return std::make_unique(); +} + +} // namespace tosa +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tosa/transforms/lower_global_tensors.cc b/tensorflow/compiler/mlir/tosa/transforms/lower_global_tensors.cc deleted file mode 100644 index de30f7c2fb0507..00000000000000 --- a/tensorflow/compiler/mlir/tosa/transforms/lower_global_tensors.cc +++ /dev/null @@ -1,206 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include - -#include "mlir/Dialect/MLProgram/IR/MLProgram.h" // from @llvm-project -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" -#include "tensorflow/compiler/mlir/tosa/transforms/passes.h" - -#define PASS_NAME "tosa-lower-global-tensors" -#define DEBUG_TYPE PASS_NAME - -namespace mlir::tosa { - -#define GEN_PASS_DEF_LOWERGLOBALTENSORS -#include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" - -namespace { - -class LowerGlobalTensorsPass - : public impl::LowerGlobalTensorsBase { - public: - void getDependentDialects(DialectRegistry& registry) const override { - registry.insert(); - } - - // Converts TFLite state operations to the MLProgram equivalent. - void runOnOperation() override { - auto* context = &getContext(); - auto moduleOp = getOperation(); - mlir::OpBuilder builder(moduleOp.getBodyRegion()); - - DenseMap symNameToFunction; - for (auto func : moduleOp.getOps()) { - symNameToFunction[func.getSymName()] = func; - } - - DenseMap sharedNameToConstant; - DenseMap sharedNameToLoc; - - SmallVector handleOps; - SmallVector assignOps; - SmallVector readOps; - for (auto it : symNameToFunction) { - auto func = std::get<1>(it); - // Look through the initialization functions and find the assigned values - // for each handle, save out the constant value. - for (auto init : func.getOps()) { - auto findInitFunc = - symNameToFunction.find(init.getSessionInitFunction()); - if (findInitFunc == symNameToFunction.end()) { - init.emitError("unable to find initialization function: " + - init.getSessionInitFunction()); - continue; - } - func::FuncOp initFunc = std::get<1>(*findInitFunc); - for (auto assign : initFunc.getOps()) { - auto handle = dyn_cast( - assign.getResourceId().getDefiningOp()); - if (!handle) continue; - - DenseElementsAttr constant; - if (!matchPattern(assign.getValue(), m_Constant(&constant))) { - // Quantized types we can not use the m_Constant matcher. - if (auto constOp = dyn_cast( - assign.getValue().getDefiningOp())) { - constant = constOp.getValue().cast(); - } - } - if (!constant) continue; - - auto name = handle.getSharedName(); - sharedNameToConstant[name] = constant; - sharedNameToLoc[name] = handle.getLoc(); - } - } - - // We also want to grab the list of operations to replace. - for (auto& op : func.getOps()) { - if (auto handle = dyn_cast(op)) - handleOps.push_back(handle); - if (auto assign = dyn_cast(op)) - assignOps.push_back(assign); - if (auto read = dyn_cast(op)) - readOps.push_back(read); - } - } - - // TF::CallOnceOps are no longer needed as we have already extracted their - // state. - SmallVector callOnceOps; - for (auto func : moduleOp.getOps()) { - for (auto init : func.getOps()) { - callOnceOps.push_back(init); - } - } - for (auto op : callOnceOps) op.erase(); - - // Create the ml_program::GlobalOps to store our new global variables. 
- DenseMap symbolRefMap; - for (auto it : sharedNameToConstant) { - auto name = std::get<0>(it); - auto attribute = std::get<1>(it); - auto locIt = sharedNameToLoc.find(name); - LocationAttr loc = mlir::UnknownLoc(); - if (locIt != sharedNameToLoc.end()) { - loc = std::get<1>(*locIt); - } - - // TODO(suderman): Determine the global type based on all store - // operations. - auto global = builder.create( - loc, name, attribute.getType(), /*is_mutable=*/true, attribute, - nullptr); - global.setPrivate(); - - symbolRefMap[name] = global; - } - - // Replace the assign ops with a global store operation. - for (auto assign : assignOps) { - auto handle = dyn_cast( - assign.getResourceId().getDefiningOp()); - if (!handle) continue; - - Value value = assign.getValue(); - auto globalOpIt = symbolRefMap.find(handle.getSharedName()); - if (globalOpIt == symbolRefMap.end()) { - assign->emitError( - "unable to find corresponding GlobalOp for op's VarHandle"); - continue; - } - auto globalOp = std::get<1>(*globalOpIt); - - builder.setInsertionPoint(assign); - if (globalOp.getType() != value.getType()) { - value = builder - .create( - assign.getLoc(), globalOp.getType(), value) - .getResult(0); - } - - auto globalSymbolRef = SymbolRefAttr::get(context, globalOp.getSymName()); - builder.create(assign.getLoc(), - globalSymbolRef, value); - assign.erase(); - } - - for (auto read : readOps) { - auto handle = dyn_cast( - read.getResourceId().getDefiningOp()); - if (!handle) continue; - - auto globalOpIt = symbolRefMap.find(handle.getSharedName()); - if (globalOpIt == symbolRefMap.end()) continue; - auto globalOp = std::get<1>(*globalOpIt); - - builder.setInsertionPoint(read); - - auto globalSymbolRef = SymbolRefAttr::get(context, globalOp.getSymName()); - Value load = builder.create( - read.getLoc(), globalOp.getType(), globalSymbolRef); - - if (read.getType() != load.getType()) { - load = builder - .create(read.getLoc(), - read.getType(), load) - .getResult(0); - } - read.getResult().replaceAllUsesWith(load); - read.erase(); - } - - for (auto handle : handleOps) { - if (handle.getResult().use_empty()) { - handle.erase(); - } - } - } -}; - -} // namespace - -std::unique_ptr> createLowerGlobalTensorsPass() { - return std::make_unique(); -} - -} // namespace mlir::tosa diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.h b/tensorflow/compiler/mlir/tosa/transforms/passes.h index 99f9465c8a639c..e41453b0b9af8b 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.h +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.h @@ -56,7 +56,6 @@ std::unique_ptr> createLegalizeTFLPass( ArrayRef disabled_patterns = std::nullopt, ArrayRef enabled_patterns = std::nullopt); -std::unique_ptr> createLowerGlobalTensorsPass(); std::unique_ptr> createRetainCallOnceFuncsPass(); std::unique_ptr> createStripModuleMetadataPass(); std::unique_ptr> createConvertTFLUint8Pass(); @@ -68,6 +67,7 @@ std::unique_ptr> createLowerComplexTypesPass(); std::unique_ptr> createStripFunctionMetadataPass(); std::unique_ptr> createStripQuantTypesPass(); std::unique_ptr> createVerifyFullyConvertedPass(); +std::unique_ptr> createLegalizeTFLStatefulPass(); #define GEN_PASS_REGISTRATION #define GEN_PASS_CLASSES @@ -79,12 +79,12 @@ std::unique_ptr> createVerifyFullyConvertedPass(); #define GEN_PASS_DECL_TOSASTRIPQUANTTYPESPASS #define GEN_PASS_DECL_TOSALOWERCOMPLEXTYPESPASS #define GEN_PASS_DECL_TOSADEQUANTIZETFLSOFTMAXPASS -#define GEN_PASS_DECL_LOWERGLOBALTENSORS #define GEN_PASS_DECL_RETAINCALLONCEFUNCS #define 
GEN_PASS_DECL_STRIPFUNCTIONMETADATA #define GEN_PASS_DECL_STRIPMODULEMETADATA #define GEN_PASS_DECL_VERIFYFULLYCONVERTED #define GEN_PASS_DECL_CONVERTFUNCTIONMETADATA +#define GEN_PASS_DECL_TOSALEGALIZESTATEFULPASS #include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc" diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.td b/tensorflow/compiler/mlir/tosa/transforms/passes.td index e623760a4e9aca..3cf7749d875f9d 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/passes.td +++ b/tensorflow/compiler/mlir/tosa/transforms/passes.td @@ -89,12 +89,6 @@ def TosaDequantizeTFLSoftmaxPass : Pass<"tosa-dequantize-tfl-softmax", "mlir::fu let dependentDialects = ["mlir::TFL::TFLDialect", "quantfork::QuantizationForkDialect"]; } -def LowerGlobalTensors : - Pass<"tflite-lower-global-tensors", "mlir::ModuleOp"> { - let summary = "Lowers TFLite global tensors to MLProgram dialect variables."; - let constructor = "createLowerGlobalTensorsPass()"; -} - def RetainCallOnceFuncs : Pass<"tflite-retain-call-once-funcs", "mlir::ModuleOp"> { let summary = "Guarantees that functions used by tfl.call_once are retained."; @@ -125,3 +119,11 @@ def ConvertFunctionMetadata : let constructor = "createConvertFunctionMetadataPass()"; } +def TosaLegalizeTFLStatefulPass : Pass<"tosa-legalize-tfl-stateful-tensors", "mlir::ModuleOp"> { + let summary = "Legalize tfl stateful operators to tosa stateful operators"; + let description = [{ + This pass is legalizing the tfl.call_once op to tosa stateful operators + }]; + let constructor = "createLegalizeTFLStatefulPass()"; + let dependentDialects = ["mlir::TFL::TFLDialect"]; +} diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 21dd107643be51..ae803f5d16dd04 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -1017,6 +1017,7 @@ tf_xla_py_strict_test( enable_mlir_bridge = True, python_version = "PY3", tags = [ + "no_aarch64", # TODO(b/315533266) "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", ], @@ -1239,7 +1240,7 @@ tf_xla_py_strict_test( ], enable_mlir_bridge = True, python_version = "PY3", - shard_count = 5, + shard_count = 1, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index a312df10e1f0f5..b54c2e54fa3552 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1061,8 +1061,7 @@ def testMatMul(self): expected=np.array([[4.2384180773686798]], dtype=dtype), rtol=1e-14) - # TODO(phawkins): failing on GPU, no registered kernel. - def DISABLED_testSparseMatMul(self): + def testSparseMatMul(self): # Binary wrappers for sparse_matmul with different hints def SparseMatmulWrapperTF(a, b): return math_ops.sparse_matmul(a, b, a_is_sparse=True) @@ -1073,10 +1072,13 @@ def SparseMatmulWrapperFT(a, b): def SparseMatmulWrapperTT(a, b): return math_ops.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True) - self._testMatMul(math_ops.sparse_matmul, self.float_types) - self._testMatMul(SparseMatmulWrapperTF, self.float_types) - self._testMatMul(SparseMatmulWrapperFT, self.float_types) - self._testMatMul(SparseMatmulWrapperTT, self.float_types) + # TODO(b/314165739): SparseMatmul XlaBuilder lowering does not support + # float16 and float64. 
+ float_types = self.float_types - {np.float16, np.float64} + self._testMatMul(math_ops.sparse_matmul, float_types) + self._testMatMul(SparseMatmulWrapperTF, float_types) + self._testMatMul(SparseMatmulWrapperFT, float_types) + self._testMatMul(SparseMatmulWrapperTT, float_types) def testBatchMatMul(self): # Tests with batches of matrices. diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index 7c48f5e3ec6518..01142082ae24f5 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -284,8 +284,9 @@ def testRandomNormalIsFinite(self): @parameterized.named_parameters( (f'_{dtype.name}_{seed}', dtype, seed) # pylint: disable=g-complex-comprehension - for seed in ([1, 2], [12, 23], [123, 456], [25252, 314159]) - for dtype in _allowed_types()) + for seed in ([1, 2], [12, 23], [25252, 314159]) + for dtype in _allowed_types() + ) def testDistributionOfStatelessRandomNormal(self, dtype, seed): """Use Anderson-Darling test to test distribution appears normal.""" with self.session() as sess, self.test_scope(): diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 47cc309b45452b..46f192648ecaa6 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -1270,13 +1270,13 @@ def assert_output_shapes(output, expected_shape): ): reduce_with_shapes((None, 4, 5), (3, None, 5), (13, 4, 5)) - @parameterized.parameters( - random_ops_util.Algorithm.THREEFRY, - random_ops_util.Algorithm.PHILOX, - random_ops_util.Algorithm.AUTO_SELECT, + @parameterized.product( + algorithm=[random_ops_util.Algorithm.THREEFRY, + random_ops_util.Algorithm.PHILOX, + random_ops_util.Algorithm.AUTO_SELECT], + dtype=[np.uint8, np.uint64], ) - def testRngBitGenerator(self, algorithm): - dtype = np.uint64 + def testRngBitGenerator(self, algorithm, dtype): initial_state = array_ops.placeholder(np.uint64, shape=(2,)) shape = (2, 3) res = xla.rng_bit_generator(algorithm, initial_state, shape, dtype=dtype) diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 448e1cbc9e61ba..b91fb494667c5f 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -3,9 +3,9 @@ # and provide TensorRT operators and converter package. # APIs are meant to change over time. 
-# Placeholder: load py_proto_library load("//tensorflow:strict.default.bzl", "py_strict_library") -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +# Placeholder: load py_proto_library load( "//tensorflow:tensorflow.bzl", "VERSION", @@ -21,17 +21,18 @@ load( "tf_additional_all_protos", "tf_proto_library", ) -load( - "@local_tsl//tsl/platform/default:cuda_build_defs.bzl", - "cuda_rpath_flags", -) -load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") # Platform specific build config load( "//tensorflow/core/platform:build_config_root.bzl", "if_static", ) +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load( + "@local_tsl//tsl/platform/default:cuda_build_defs.bzl", + "cuda_rpath_flags", +) +load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -103,6 +104,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":trt_logging", @@ -157,6 +160,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":common_utils", @@ -239,6 +244,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":testutils", @@ -318,6 +325,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":common_utils", @@ -354,6 +363,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":testutils", @@ -411,6 +422,7 @@ tf_cuda_library( "utils/trt_execution_context.h", "utils/trt_shape_optimization_profiles.h", ], + features = ["-layering_check"], deps = [ ":common_utils", ":trt_allocator", @@ -431,6 +443,7 @@ tf_cuda_library( name = "trt_logging", srcs = ["utils/trt_logger.cc"], hdrs = ["utils/trt_logger.h"], + features = ["-layering_check"], visibility = ["//visibility:public"], deps = [ ":common_utils", @@ -515,6 +528,7 @@ tf_cuda_library( name = "trt_allocator", srcs = ["utils/trt_allocator.cc"], hdrs = ["utils/trt_allocator.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", @@ -561,6 +575,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":trt_resources", @@ -576,6 +592,7 @@ tf_cuda_library( "convert/logger_registry.h", ], copts = tf_copts(), + features = ["-layering_check"], deps = [ "//tensorflow/core:lib", "@com_google_absl//absl/strings", @@ -780,6 +797,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":testutils", @@ -816,6 +835,8 @@ tf_cuda_cc_test( "no_cuda_on_cpu_tap", "no_windows", "nomac", + # TODO(b/303453873): Re-enable test once TensorRT has been updated + "notap", ], deps = [ ":testutils", diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 2ae20bdfe8f07d..332be3f50bf342 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ 
b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -4024,7 +4024,7 @@ TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertFill) { Reset(); // random data AddTestWeights("dims", {2}, {2, 2}, DT_INT32); - AddTestWeights("value", {1}, {42.0}, tf_type_); + AddTestWeights("value", {1}, {42}, tf_type_); RunValidationAndConversion( node_def, absl::StatusCode::kUnimplemented, convert_not_supported_implicit(node_def.op(), node_def.name())); @@ -4042,16 +4042,19 @@ TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertFill) { for (auto output_dims : output_dims_params) { for (auto value_dims : value_dims_params) { Reset(); - std::vector dims_dims = {output_dims.size()}; + std::vector dims_dims = { + static_cast(output_dims.size())}; if (dims_is_tensor) { AddTestTensor("dims", dims_dims, DT_INT32, output_dims, dims_dims); } else { AddTestWeights("dims", dims_dims, output_dims, DT_INT32); } if (value_is_tensor) { - AddTestTensor("value", value_dims, tf_type_, {val}); + AddTestTensor("value", value_dims, tf_type_, + {static_cast(val)}); } else { - AddTestWeights("value", value_dims, {val}, tf_type_); + AddTestWeights("value", value_dims, {static_cast(val)}, + tf_type_); } size_t nb_el = 1; for (auto d : output_dims) { @@ -4084,7 +4087,7 @@ TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertRange) { // (a) for all parameters, when shape_idx > 3 // (b) for all parameters, except shape_idx, when shape_idx >= 0 // (c) for none of the shape_idx < 0 - if (shape_idx > 3 || shape_idx >= 0 && shape_idx != i) { + if (shape_idx > 3 || (shape_idx >= 0 && shape_idx != i)) { partial_shape_dims = {1}; } AddTestTensor(name[i], {1}, type[i], value[i], partial_shape_dims); @@ -4140,7 +4143,7 @@ TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertRange) { limit_type == DT_INT32 && delta_type == DT_INT32; - if (all_weights || all_integers && !config[2]) { + if (all_weights || (all_integers && !config[2])) { // Reject invalid parameters if delta = 0 and it's passed as a weight. param_value[2] = {0}; set_parameters(param_name, param_value, param_type, config); @@ -9435,8 +9438,8 @@ void OpConverter_Select::RunTest(const string& opName) { std::accumulate(std::begin(expect_dims), std::end(expect_dims), 1, std::multiplies()); - assert(rank_out == expected_out ? expected_out->size() - : rank[use_indices >= 0 ? 0 : 1]); + assert(rank_out == (expected_out ? expected_out->size() + : rank[use_indices >= 0 ? 0 : 1])); expected_output.resize(rank_out); const auto& data_then = *par_value[1]; @@ -9476,7 +9479,7 @@ void OpConverter_Select::RunTest(const string& opName) { const auto nMax = testing_SelectV2 ? 
2 : 1; for (int n = 0; n < nMax; n++) { set_parameters(); - if (testing_SelectV2 || same_then_else_shapes && same_cond_chape) { + if (testing_SelectV2 || (same_then_else_shapes && same_cond_chape)) { TestOpConverter(node, exp_dims, OkStatus(), OkStatus(), ElementsAreArray(expected_output)); } else { diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc index fc5fc589211ec1..0e01bcaadb9f63 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/fill_ops.cc @@ -288,7 +288,7 @@ class ConvertRange : public ConvertFillBase { }; std::string convert_range_error_msg(float start, float limit, float delta) { - constexpr char* format_string = + constexpr const char* format_string = "For parameters (start, limit) = (%.2f, %.2f) " "of the Range operation delta cannot be %s, got %.2f"; return absl::StrFormat(format_string, start, limit, diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc index a68ffceb1534a1..4c21e49f12bf0f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/ops/selectv2.cc @@ -31,10 +31,10 @@ class ConvertSelectBase : public OpConverterBase { public: explicit ConvertSelectBase(const OpConverterParams* params, const std::string& layer_name) - : layer_name_(layer_name), - OpConverterBase( + : OpConverterBase( params, - {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}) {} + {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32}), + layer_name_(layer_name) {} static constexpr std::array InputSpec() { return std::array{ diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index c8dc6721853ccb..5ae1c907f0138d 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -14,6 +14,7 @@ load( ) load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") +load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static") load( "@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", @@ -506,9 +507,11 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", + "@local_tsl//tsl/platform:tensor_float_32_hdr_lib", "@local_xla//xla:executable_run_options", "@local_xla//xla:protobuf_util", "@local_xla//xla:shape_util", @@ -522,10 +525,11 @@ cc_library( "@local_xla//xla/client:xla_computation", "@local_xla//xla/hlo/ir:hlo", "@local_xla//xla/service:computation_placer_hdr", + "@local_xla//xla/service:hlo_proto_cc", "@local_xla//xla/translate/mhlo_to_hlo:layout_util", ] + if_libtpu([ ":xla_tpu_backend_registration", - ]), + ]) + if_static(["@local_tsl//tsl/platform:tensor_float_32_utils"]), alwayslink = 1, ) @@ -901,6 +905,7 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@local_tsl//tsl/platform:tensor_float_32_hdr_lib", "@local_xla//xla:literal", "@local_xla//xla:literal_util", "@local_xla//xla:statusor", @@ -908,7 +913,7 @@ tf_cc_test( "@local_xla//xla/client:local_client", "@local_xla//xla/client:xla_computation", "@local_xla//xla/service:cpu_plugin", - ], + ] + 
if_static(["@local_tsl//tsl/platform:tensor_float_32_utils"]), ) tf_cc_test( diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index e40ae462cab0f0..81bbbe1955642d 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -32,6 +32,8 @@ class BatchMatMulOp : public XlaOpKernel { explicit BatchMatMulOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("adj_x", &adj_x_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("adj_y", &adj_y_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("grad_x", &grad_x_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("grad_y", &grad_y_)); if (ctx->HasAttr("Tout")) { DataType output_type; @@ -48,15 +50,18 @@ class BatchMatMulOp : public XlaOpKernel { tsl::tensor_float_32_execution_enabled() ? xla::PrecisionConfig::DEFAULT : xla::PrecisionConfig::HIGHEST; - auto result = xla::BatchDot(MaybeConjugate(ctx->Input(0), adj_x_), adj_x_, - MaybeConjugate(ctx->Input(1), adj_y_), adj_y_, - precision, preferred_element_type_); + auto result = + xla::BatchDot(MaybeConjugate(ctx->Input(0), adj_x_), adj_x_, + MaybeConjugate(ctx->Input(1), adj_y_), adj_y_, precision, + preferred_element_type_, grad_x_, grad_y_); ctx->SetOutput(0, result); } private: bool adj_x_; bool adj_y_; + bool grad_x_; + bool grad_y_; std::optional preferred_element_type_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index ed0930e6243b7b..5a2a6e781cd65d 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -16,6 +16,7 @@ limitations under the License. // XLA-specific MatMul Op. #include +#include #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -36,9 +37,16 @@ constexpr std::array kMatmulTypes = { class MatMulOp : public XlaOpKernel { public: explicit MatMulOp(OpKernelConstruction* ctx, bool is_sparse = false) - : XlaOpKernel(ctx), is_sparse_(is_sparse) { + : XlaOpKernel(ctx), + is_sparse_(is_sparse), + grad_a_(false), + grad_b_(false) { OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_)); + if (!is_sparse) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("grad_a", &grad_a_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("grad_b", &grad_b_)); + } if (is_sparse) { OP_REQUIRES_OK(ctx, ctx->GetAttr("Ta", &a_type_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("Tb", &b_type_)); @@ -95,14 +103,16 @@ class MatMulOp : public XlaOpKernel { tsl::tensor_float_32_execution_enabled() ? 
xla::PrecisionConfig::DEFAULT : xla::PrecisionConfig::HIGHEST; - ctx->SetOutput(0, - xla::BatchDot(a, transpose_a_, b, transpose_b_, precision)); + ctx->SetOutput(0, xla::BatchDot(a, transpose_a_, b, transpose_b_, precision, + std::nullopt, grad_a_, grad_b_)); } private: bool is_sparse_; bool transpose_a_; bool transpose_b_; + bool grad_a_; + bool grad_b_; DataType a_type_; DataType b_type_; }; diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc index 098ecf39792e21..cc0cdfc2036fa7 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc @@ -428,7 +428,7 @@ REGISTER_XLA_OP(Name("StatelessRandomGetKeyCounterAlg"), GetKeyCounterAlgOp); REGISTER_XLA_OP(Name("XlaRngBitGenerator") .CompileTimeConstantInput("algorithm") .CompileTimeConstantInput("shape") - .TypeConstraint("dtype", {DT_UINT32, DT_UINT64}), + .TypeConstraint("dtype", {DT_UINT8, DT_UINT32, DT_UINT64}), MlirXlaOpKernel); } // namespace diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 0cfcc0a5a7a78a..c8a4984c356359 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -237,13 +237,13 @@ MlirOptimizationPassState GetPassStateImpl( return MlirOptimizationPassState::FallbackEnabled; case MlirBridgeRolloutPolicy::kDisabledByUser: VLOG(1) << "Skipping MLIR CPU/GPU Bridge, disabled by user."; - metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "tfxla", false, + metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "v2", false, "disabled_by_user"); return MlirOptimizationPassState::Disabled; default: // This case should never be hit. Added here to be consistent with OSS // implementation. - metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "ftxla", false, + metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "v2", false, "invalid_graph"); return MlirOptimizationPassState::Disabled; } diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index 480dc474410359..edb2a40f4d332b 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -816,7 +816,7 @@ REGISTER_OP("XlaRngBitGenerator") .Input("shape: Tshape") .Output("output_key: uint64") .Output("output: dtype") - .Attr("dtype: {int32, int64, uint32, uint64} = DT_UINT64") + .Attr("dtype: {uint8, int8, int32, int64, uint32, uint64} = DT_UINT64") .Attr("Tshape: {int32, int64} = DT_INT32") .SetShapeFn([](shape_inference::InferenceContext* c) { shape_inference::ShapeHandle algorithm; diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 1336d58521404a..01bb69d16ee264 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla.h" +#include + #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "xla/client/client_library.h" #include "xla/client/local_client.h" @@ -25,20 +27,61 @@ limitations under the License. 
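For context on the MatMul and BatchMatMul kernel changes above: both kernels now derive the XLA dot precision from the global TensorFloat-32 setting and forward the new grad_a/grad_b (grad_x/grad_y) attributes to xla::BatchDot. A minimal illustrative sketch of that selection logic follows; the helper name is hypothetical, while xla::PrecisionConfig and the tsl TF32 query are the same symbols the kernels in this change rely on.

#include "xla/xla_data.pb.h"                     // xla::PrecisionConfig
#include "tsl/platform/tensor_float_32_utils.h"  // tensor_float_32_execution_enabled()

// Hypothetical helper mirroring the pattern in matmul_op.cc and
// batch_matmul_op.cc: DEFAULT lets the backend use TF32-style math for
// float32 dots, HIGHEST forces full float32 precision once TF32 execution
// has been disabled.
inline xla::PrecisionConfig::Precision ChooseDotPrecision() {
  return tsl::tensor_float_32_execution_enabled()
             ? xla::PrecisionConfig::DEFAULT
             : xla::PrecisionConfig::HIGHEST;
}

Callers can flip the behavior with tsl::enable_tensor_float_32_execution(false), which is exactly what the new tf2xla_test.cc fixture below does before checking that dot and convolution instructions end up with HIGHEST operand precision.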
#include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/test.h" +#include "tsl/platform/tensor_float_32_utils.h" namespace tensorflow { namespace { +class ConvertGraphDefToXlaWithTF32Disabled : public ::testing::Test { + public: + ConvertGraphDefToXlaWithTF32Disabled() { + tsl::enable_tensor_float_32_execution(false); + } + ~ConvertGraphDefToXlaWithTF32Disabled() override { + tsl::enable_tensor_float_32_execution(true); + } +}; + AttrValue TypeAttrValue(DataType type) { AttrValue attr_value; SetAttrValue(type, &attr_value); return attr_value; } +AttrValue StringAttrValue(StringPiece str) { + AttrValue attr_value; + SetAttrValue(str, &attr_value); + return attr_value; +} + +AttrValue IntAttrValue(int i) { + AttrValue attr_value; + SetAttrValue(i, &attr_value); + return attr_value; +} + +AttrValue IntVectorAttrValue(const std::vector& ints) { + AttrValue attr_value; + SetAttrValue(ints, &attr_value); + return attr_value; +} + +TensorShapeProto TensorShape(const std::vector& dims) { + TensorShapeProto shape; + for (int i = 0; i < dims.size(); ++i) { + shape.add_dim(); + shape.mutable_dim(i)->set_size(dims[i]); + } + return shape; +} + GraphDef SumGraph() { GraphDef graph_def; NodeDef* x = graph_def.add_node(); @@ -97,6 +140,190 @@ TEST(ConvertGraphDefToXla, Sum) { ConvertGraphDefToXla(graph_def, config, client, &computation))); } +GraphDef EinsumGraph() { + GraphDef graph_def; + NodeDef* x = graph_def.add_node(); + x->set_name("x"); + x->set_op("Placeholder"); + (*x->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT); + NodeDef* y = graph_def.add_node(); + y->set_name("y"); + y->set_op("Placeholder"); + (*y->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT); + NodeDef* einsum = graph_def.add_node(); + einsum->set_name("einsum"); + einsum->set_op("Einsum"); + einsum->add_input("x"); + einsum->add_input("y"); + (*einsum->mutable_attr())["equation"] = StringAttrValue("ij,jk->ik"); + (*einsum->mutable_attr())["T"] = TypeAttrValue(DT_FLOAT); + (*einsum->mutable_attr())["N"] = IntAttrValue(2); + return graph_def; +} + +tf2xla::Config EinsumConfig() { + tf2xla::Config config; + + tf2xla::Feed* x_feed = config.add_feed(); + x_feed->mutable_id()->set_node_name("x"); + *x_feed->mutable_shape() = TensorShape({2, 2}); + + tf2xla::Feed* y_feed = config.add_feed(); + y_feed->mutable_id()->set_node_name("y"); + *y_feed->mutable_shape() = TensorShape({2, 2}); + + config.add_fetch()->mutable_id()->set_node_name("einsum"); + return config; +} + +TEST(ConvertGraphDefToXla, EinsumIsConvertedToDotWithDefaultPrecision) { + GraphDef graph_def = EinsumGraph(); + tf2xla::Config config = EinsumConfig(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + int num_dots = 0; + const xla::HloModuleProto& module_proto = computation.proto(); + for (const xla::HloComputationProto& computation_proto : + module_proto.computations()) { + for (const xla::HloInstructionProto& instruction_proto : + computation_proto.instructions()) { + if 
(instruction_proto.opcode() == "dot") { + num_dots++; + ASSERT_EQ(instruction_proto.precision_config().operand_precision_size(), + 2); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(0), + xla::PrecisionConfig::DEFAULT); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(1), + xla::PrecisionConfig::DEFAULT); + } + } + } + EXPECT_EQ(num_dots, 1); +} + +TEST_F(ConvertGraphDefToXlaWithTF32Disabled, + EinsumIsConvertedToDotWithHighestPrecision) { + GraphDef graph_def = EinsumGraph(); + tf2xla::Config config = EinsumConfig(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + int num_dots = 0; + const xla::HloModuleProto& module_proto = computation.proto(); + for (const xla::HloComputationProto& computation_proto : + module_proto.computations()) { + for (const xla::HloInstructionProto& instruction_proto : + computation_proto.instructions()) { + if (instruction_proto.opcode() == "dot") { + num_dots++; + ASSERT_EQ(instruction_proto.precision_config().operand_precision_size(), + 2); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(0), + xla::PrecisionConfig::HIGHEST); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(1), + xla::PrecisionConfig::HIGHEST); + } + } + } + EXPECT_EQ(num_dots, 1); +} + +GraphDef Conv2DGraph() { + GraphDef graph_def; + NodeDef* x = graph_def.add_node(); + x->set_name("x"); + x->set_op("Placeholder"); + (*x->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT); + NodeDef* y = graph_def.add_node(); + y->set_name("y"); + y->set_op("Placeholder"); + (*y->mutable_attr())["dtype"] = TypeAttrValue(DT_FLOAT); + NodeDef* einsum = graph_def.add_node(); + einsum->set_name("conv2d"); + einsum->set_op("Conv2D"); + einsum->add_input("x"); + einsum->add_input("y"); + (*einsum->mutable_attr())["T"] = TypeAttrValue(DT_FLOAT); + (*einsum->mutable_attr())["padding"] = StringAttrValue("VALID"); + (*einsum->mutable_attr())["strides"] = IntVectorAttrValue({1, 1, 1, 1}); + return graph_def; +} + +tf2xla::Config Conv2DConfig() { + tf2xla::Config config; + tf2xla::Feed* x_feed = config.add_feed(); + x_feed->mutable_id()->set_node_name("x"); + *x_feed->mutable_shape() = TensorShape({1, 1, 2, 2}); + + tf2xla::Feed* y_feed = config.add_feed(); + y_feed->mutable_id()->set_node_name("y"); + *y_feed->mutable_shape() = TensorShape({1, 1, 2, 2}); + config.add_fetch()->mutable_id()->set_node_name("conv2d"); + return config; +} + +TEST(ConvertGraphDefToXla, Conv2DIsConvertedToConvolutionWithDefaultPrecision) { + GraphDef graph_def = Conv2DGraph(); + tf2xla::Config config = Conv2DConfig(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + int num_convolutions = 0; + const xla::HloModuleProto& module_proto = computation.proto(); + for (const xla::HloComputationProto& computation_proto : + module_proto.computations()) { + for (const xla::HloInstructionProto& instruction_proto : + computation_proto.instructions()) { + if (instruction_proto.opcode() == "convolution") { + num_convolutions++; + ASSERT_EQ(instruction_proto.precision_config().operand_precision_size(), + 2); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(0), + xla::PrecisionConfig::DEFAULT); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(1), + 
xla::PrecisionConfig::DEFAULT); + } + } + } + EXPECT_EQ(num_convolutions, 1); +} + +TEST_F(ConvertGraphDefToXlaWithTF32Disabled, + Conv2DIsConvertedToConvolutionWithHighestPrecision) { + GraphDef graph_def = Conv2DGraph(); + tf2xla::Config config = Conv2DConfig(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + int num_convolutions = 0; + const xla::HloModuleProto& module_proto = computation.proto(); + for (const xla::HloComputationProto& computation_proto : + module_proto.computations()) { + for (const xla::HloInstructionProto& instruction_proto : + computation_proto.instructions()) { + if (instruction_proto.opcode() == "convolution") { + num_convolutions++; + ASSERT_EQ(instruction_proto.precision_config().operand_precision_size(), + 2); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(0), + xla::PrecisionConfig::HIGHEST); + EXPECT_EQ(instruction_proto.precision_config().operand_precision(1), + xla::PrecisionConfig::HIGHEST); + } + } + } + EXPECT_EQ(num_convolutions, 1); +} + TEST(ConvertGraphDefToXla, SumWithUnusedArgument) { GraphDef graph_def = SumGraph(); tf2xla::Config config = SumConfig(); diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 917d775c80011d..dc4109f52f96b6 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -228,4 +228,34 @@ int XlaCompiledCpuFunction::LookupResultIndex(const string& name) const { return LookupNameIndex(name, result_names_); } +const char* XlaCompiledCpuFunction::GetArgName(const int index) const { + assert(arg_names_ != nullptr); + if (index < 0 || index >= num_args_) { + std::cerr << "XlaCompiledCpuFunction::GetArgName: index '" << index + << "' out of range [0, " << num_args_ << "].\n"; + return nullptr; + } + return arg_names_[index]; +} + +const char* XlaCompiledCpuFunction::GetVariableName(int index) const { + assert(variable_names_ != nullptr); + if (index < 0 || index >= num_variables_) { + std::cerr << "XlaCompiledCpuFunction::GetVariableName: index '" << index + << "' out of range [0, " << num_variables_ << ").\n"; + return nullptr; + } + return variable_names_[index]; +} + +const char* XlaCompiledCpuFunction::GetResultName(int index) const { + assert(result_names_ != nullptr); + if (index < 0 || index >= num_results_) { + std::cerr << "XlaCompiledCpuFunction::GetResultName: index '" << index + << "' out of range [0, " << num_results_ << ").\n"; + return nullptr; + } + return result_names_[index]; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 871ddb32d2652a..d03f06e14f5bce 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -294,6 +294,18 @@ class XlaCompiledCpuFunction { // Recommended usage is to capture this in a variable for re-use. int LookupResultIndex(const string& name) const; + // Returns the name of the argument at `index`. + // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. + const char* GetArgName(int index) const; + + // Returns the name of the variable at `index`. + // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. 
+ const char* GetVariableName(int index) const; + + // Returns the name of the result at `index`. + // Returns nullptr if `HasNameIndices() == false` or `index` is out of range. + const char* GetResultName(int index) const; + // Returns the shape of the args and results. May return nullptr if the // program shape isn't available. const xla::ProgramShapeProto* ProgramShape() const { return program_shape_; } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index aa2c761ccb6e26..bb8b29de5b9acf 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include +#include #include #include #include @@ -27,9 +28,11 @@ limitations under the License. #include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" +#include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" #include "absl/memory/memory.h" #include "absl/status/status.h" +#include "absl/strings/string_view.h" #include "absl/types/variant.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" @@ -52,6 +55,7 @@ limitations under the License. #include "xla/client/xla_builder.h" #include "xla/client/xla_computation.h" #include "xla/protobuf_util.h" +#include "xla/service/hlo.pb.h" #include "xla/shape_util.h" #include "xla/util.h" #include "tensorflow/core/common_runtime/device.h" @@ -72,6 +76,7 @@ limitations under the License. #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/tpu/tpu_defs.h" #include "tensorflow/core/util/dump_graph.h" +#include "tsl/platform/tensor_float_32_utils.h" namespace tensorflow { namespace { @@ -1435,6 +1440,38 @@ class DummyStackTrace : public AbstractStackTrace { StackFrame({"dummy_file_name", 10, "dummy_function_name"})}; }; +namespace { + +// Add precisions configs to the HLO module to avoid TensorFloat32 computations +// in XLA. +// +// Some operations, such as Einsum are converted through MlirXlaOpKernel, which +// doesn't set the precisions, so we set them all here. +// +// TODO(tdanyluk): We may want to restrict this logic to only set the operand +// precision for F32 operands. (Historically, it was set without regard to +// operand type in other parts of TF2XLA.) 
+void IncreasePrecisionsToAvoidTF32(xla::HloModuleProto& module) { + static constexpr std::array kOpsPossiblyUsingTF32 = { + "dot", "convolution"}; + + xla::PrecisionConfig precision_config; + precision_config.add_operand_precision(xla::PrecisionConfig::HIGHEST); + precision_config.add_operand_precision(xla::PrecisionConfig::HIGHEST); + + for (xla::HloComputationProto& computation : *module.mutable_computations()) { + for (xla::HloInstructionProto& instruction : + *computation.mutable_instructions()) { + if (absl::c_find(kOpsPossiblyUsingTF32, instruction.opcode()) != + kOpsPossiblyUsingTF32.end()) { + *instruction.mutable_precision_config() = precision_config; + } + } + } +} + +} // namespace + Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, string const& name, std::unique_ptr graph, @@ -1571,6 +1608,10 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, *result->host_compute_metadata.add_host_to_device() = recv; } + if (!tsl::tensor_float_32_execution_enabled()) { + IncreasePrecisionsToAvoidTF32(*result->computation->mutable_proto()); + } + VLOG(2) << "Outputs: total: " << context->retvals().size() << " nonconstant: " << num_nonconst_outputs; VLOG(2) << "XLA output shape: " diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index ff6b72ca562976..ad65c1708794fd 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -213,6 +213,26 @@ TEST(XlaJitCompiledCpuFunction, Sum) { EXPECT_EQ(0, function.num_variables()); EXPECT_EQ(function.LookupVariableIndex("x"), -1); + // Expect that name and index lookups match. + for (int i = 0; i < function.num_args(); ++i) { + const char* name = function.GetArgName(i); + ASSERT_NE(name, nullptr); + const int roundtrip_i = function.LookupArgIndex(name); + EXPECT_EQ(roundtrip_i, i) << " name= " << name; + } + for (int i = 0; i < function.num_results(); ++i) { + const char* name = function.GetResultName(i); + ASSERT_NE(name, nullptr); + const int roundtrip_i = function.LookupResultIndex(name); + EXPECT_EQ(roundtrip_i, i) << " name= " << name; + } + // Expect correct handling of invalid indices. + EXPECT_EQ(function.GetArgName(-1), nullptr); + EXPECT_EQ(function.GetArgName(function.num_args()), nullptr); + EXPECT_EQ(function.GetResultName(-1), nullptr); + EXPECT_EQ(function.GetResultName(function.num_results()), nullptr); + EXPECT_EQ(function.GetVariableName(0), nullptr); + // Check program shape. using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); @@ -263,6 +283,11 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) { EXPECT_EQ(1, function.num_variables()); EXPECT_EQ(function.LookupVariableIndex("myvar"), 1); + const char* name = function.GetVariableName(0); + EXPECT_EQ(std::string(name), "myvar"); + EXPECT_EQ(function.GetVariableName(1), nullptr); + EXPECT_EQ(function.GetVariableName(-1), nullptr); + // Check program shape. 
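The GetArgName, GetVariableName, and GetResultName accessors exercised above are the inverse of the existing Lookup*Index methods: per the header comments, they return nullptr for out-of-range indices or when the compiled function carries no name table. A minimal usage sketch, assuming a compiled XlaCompiledCpuFunction named fn and a hypothetical PrintArgNames helper:

#include <cstdio>

#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"

// Hypothetical helper: print every argument name and confirm that looking the
// name back up yields the same index, mirroring the roundtrip check in the
// test above.
void PrintArgNames(const tensorflow::XlaCompiledCpuFunction& fn) {
  for (int i = 0; i < fn.num_args(); ++i) {
    if (const char* name = fn.GetArgName(i)) {  // nullptr only if out of range
      std::printf("arg %d -> %s (roundtrip index %d)\n", i, name,
                  fn.LookupArgIndex(name));
    }
  }
}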
using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD deleted file mode 100644 index 58cf8a80e3e751..00000000000000 --- a/tensorflow/compiler/xrt/BUILD +++ /dev/null @@ -1,172 +0,0 @@ -# Description: Operations defined for XRT - -# Placeholder: load py_proto_library -load("//tensorflow:strict.default.bzl", "py_strict_library") -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load( - "//tensorflow:tensorflow.bzl", - "tf_gen_op_wrapper_py", -) -load("//tensorflow:tensorflow.default.bzl", "tf_custom_op_py_strict_library", "tf_gen_op_libs") -load( - "//tensorflow/core/platform:build_config.bzl", - "tf_proto_library", -) -load( - "@local_config_cuda//cuda:build_defs.bzl", - "if_cuda", -) - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = [ - "//learning/brain:__subpackages__", - "//tensorflow/compiler/xrt:__subpackages__", - ], - licenses = ["notice"], -) - -tf_proto_library( - name = "xrt_proto", - srcs = ["xrt.proto"], - cc_api_version = 2, - protodeps = [ - "//tensorflow/compiler/tf2xla:host_compute_metadata_proto", - "@local_xla//xla:xla_data_proto", - "@local_xla//xla:xla_proto", - "@local_xla//xla/service:hlo_proto", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "xrt_tpu_utils", - srcs = [ - "xrt_tpu_device.cc", - ], - hdrs = [ - "xrt_tpu_device.h", - ], - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/compiler/jit:xla_device", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core/tpu:tpu_configuration", - "@local_xla//xla/client:local_client", - "@local_xla//xla/stream_executor/tpu:tpu_node_context", - ], -) - -cc_library( - name = "xrt_utils", - srcs = [ - "xrt_compilation_cache.cc", - "xrt_device.cc", - "xrt_memory_manager.cc", - "xrt_metrics.cc", - "xrt_state.cc", - "xrt_util.cc", - ], - hdrs = [ - "xrt_compilation_cache.h", - "xrt_device.h", - "xrt_memory_manager.h", - "xrt_metrics.h", - "xrt_refptr.h", - "xrt_state.h", - "xrt_util.h", - ], - copts = if_cuda(["-DGOOGLE_CUDA=1"]), - visibility = ["//visibility:public"], - deps = [ - ":xrt_proto_cc", - "//tensorflow/compiler/jit:xla_device", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core/common_runtime/gpu:gpu_runtime", - "//tensorflow/core/platform:regexp", - "//tensorflow/core/profiler/lib:traceme", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:node_hash_map", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/synchronization", - "@local_xla//xla:debug_options_flags", - "@local_xla//xla:literal", - "@local_xla//xla:shape_util", - "@local_xla//xla:status_macros", - "@local_xla//xla:statusor", - "@local_xla//xla:types", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla:xla_proto_cc", - "@local_xla//xla/client:local_client", - "@local_xla//xla/hlo/ir:hlo", - "@local_xla//xla/service:backend", - "@local_xla//xla/service:executable", - "@local_xla//xla/service:shaped_buffer", - "@local_xla//xla/stream_executor", - "@local_xla//xla/stream_executor:device_memory_allocator", - "@local_xla//xla/stream_executor/integrations:tf_allocator_adapter", - ], -) - -tf_gen_op_libs( - op_lib_names = [ - "xrt_compile_ops", - "xrt_state_ops", - "xrt_execute_op", - ], - deps = [ - 
"//tensorflow/compiler/jit:common", - "//tensorflow/core:lib", - ], -) - -tf_gen_op_wrapper_py( - name = "xrt_ops_wrapper_py", - out = "xrt_ops.py", - extra_py_deps = [ - "//tensorflow/python:pywrap_tfe", - "//tensorflow/python/util:dispatch", - "//tensorflow/python/util:deprecation", - "//tensorflow/python/util:tf_export", - ], - py_lib_rule = py_strict_library, - deps = [ - ":xrt_compile_ops_op_lib", - ":xrt_execute_op_op_lib", - ":xrt_state_ops_op_lib", - ], -) - -tf_custom_op_py_strict_library( - name = "xrt_ops", - kernels = ["//tensorflow/compiler/xrt/kernels:xrt_ops"], - visibility = ["//visibility:public"], - deps = [ - ":xrt_ops_wrapper_py", - ], -) - -cc_library( - name = "xrt_server", - visibility = ["//visibility:public"], - deps = [ - ":xrt_compile_ops_op_lib", - ":xrt_execute_op_op_lib", - ":xrt_state_ops_op_lib", - "//tensorflow/compiler/xrt/kernels:xrt_ops", - ], -) - -# copybara:uncomment_begin(google-only) -# py_proto_library( -# name = "xrt_proto_py_pb2", -# api_version = 2, -# visibility = ["//visibility:public"], -# deps = [":xrt_proto"], -# ) -# copybara:uncomment_end diff --git a/tensorflow/compiler/xrt/cc/BUILD b/tensorflow/compiler/xrt/cc/BUILD deleted file mode 100644 index 9783aeaafa0815..00000000000000 --- a/tensorflow/compiler/xrt/cc/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tensorflow:tensorflow.default.bzl", "tf_gen_op_wrappers_cc") - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = ["//visibility:public"], - licenses = ["notice"], -) - -tf_gen_op_wrappers_cc( - name = "xrt_ops", - op_lib_names = [ - "xrt_compile_ops", - "xrt_state_ops", - "xrt_execute_op", - ], - pkg = "//tensorflow/compiler/xrt", -) diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD deleted file mode 100644 index e4c4075a392c3a..00000000000000 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ /dev/null @@ -1,146 +0,0 @@ -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = [ - "//learning/brain:__subpackages__", - "//tensorflow/compiler/xrt:__subpackages__", - ], - licenses = ["notice"], -) - -package_group( - name = "friends", - includes = [ - "//tensorflow/compiler/tf2xla:friends", - ], -) - -WITH_TPU_SUPPORT = "//tensorflow:with_tpu_support" - -DEFAULT = "//conditions:default" - -cc_library( - name = "xrt_state_ops", - hdrs = ["xrt_state_ops.h"], - visibility = [":friends"], - deps = [ - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xrt:xrt_proto_cc", - "//tensorflow/compiler/xrt:xrt_utils", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "@local_xla//xla:literal", - "@local_xla//xla:shape_util", - "@local_xla//xla:status_macros", - "@local_xla//xla:statusor", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:local_client", - "@local_xla//xla/service:computation_placer", - ], - alwayslink = 1, -) - -cc_library( - name = "xrt_tpu_ops", - srcs = [ - "tpu_compile_ops.cc", - "tpu_execute_op.cc", - "tpu_state_op.cc", - ], - visibility = [":friends"], - deps = [ - ":xrt_state_ops", - "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - 
"//tensorflow/compiler/xrt:xrt_proto_cc", - "//tensorflow/compiler/xrt:xrt_tpu_utils", - "//tensorflow/compiler/xrt:xrt_utils", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/core/tpu:tpu_configuration", - "//tensorflow/core/tpu:tpu_defs", - "//tensorflow/core/tpu:tpu_execute", - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_entry", - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_interface", - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_key", - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_lookup", - "//tensorflow/core/tpu/kernels:tpu_compile_op_common", - "//tensorflow/core/tpu/kernels:tpu_compile_op_hdrs", - "//tensorflow/core/tpu/kernels:tpu_mesh_state_interface", - "//tensorflow/core/tpu/kernels:tpu_op_consts", - "//tensorflow/core/tpu/kernels:tpu_op_util", - "//tensorflow/core/tpu/kernels:tpu_program_group", - "//tensorflow/core/tpu/kernels:tpu_program_group_interface", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - "@local_xla//xla:debug_options_flags", - "@local_xla//xla:shape_util", - "@local_xla//xla:status_macros", - "@local_xla//xla:statusor", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:client_library", - "@local_xla//xla/client:compile_only_client", - "@local_xla//xla/client:local_client", - "@local_xla//xla/client:xla_computation", - "@local_xla//xla/hlo/ir:hlo", - "@local_xla//xla/service:compiler", - "@local_xla//xla/service:computation_placer", - "@local_xla//xla/service:dump", - "@local_xla//xla/service:hlo_proto_cc", - "@local_xla//xla/stream_executor", - "@local_xla//xla/stream_executor/tpu:tpu_api", - ], - alwayslink = 1, -) - -cc_library( - name = "xrt_ops", - srcs = [ - "xrt_compile_ops.cc", - "xrt_execute_op.cc", - "xrt_state_ops.cc", - ], - visibility = [":friends"], - deps = select({ - WITH_TPU_SUPPORT: [":xrt_tpu_ops"], - DEFAULT: [], - }) + [ - ":xrt_state_ops", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xrt:xrt_compile_ops_op_lib", - "//tensorflow/compiler/xrt:xrt_execute_op_op_lib", - "//tensorflow/compiler/xrt:xrt_proto_cc", - "//tensorflow/compiler/xrt:xrt_state_ops_op_lib", - "//tensorflow/compiler/xrt:xrt_utils", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/strings", - "@local_xla//xla:literal_util", - "@local_xla//xla:shape_util", - "@local_xla//xla:status_macros", - "@local_xla//xla:statusor", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:client_library", - "@local_xla//xla/client:local_client", - "@local_xla//xla/client:xla_computation", - "@local_xla//xla/hlo/ir:hlo", - "@local_xla//xla/service:compiler", - "@local_xla//xla/service:computation_placer", - "@local_xla//xla/service/gpu:gpu_executable_run_options", - "@local_xla//xla/stream_executor", - ], - alwayslink = 1, -) diff --git a/tensorflow/compiler/xrt/kernels/tpu_compile_ops.cc b/tensorflow/compiler/xrt/kernels/tpu_compile_ops.cc deleted file mode 100644 index 8c3d3aa7300208..00000000000000 --- a/tensorflow/compiler/xrt/kernels/tpu_compile_ops.cc +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for compiling XLA computations and managing handles that refer to -// them. - -#include -#include - -#include "absl/cleanup/cleanup.h" -#include "absl/status/status.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "xla/client/client_library.h" -#include "xla/client/compile_only_client.h" -#include "xla/client/xla_computation.h" -#include "xla/debug_options_flags.h" -#include "xla/service/compiler.h" -#include "xla/service/dump.h" -#include "xla/service/hlo.pb.h" -#include "xla/status_macros.h" -#include "xla/statusor.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/stream_executor/tpu/tpu_api.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" -#include "tensorflow/compiler/xrt/xrt_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/monitoring/timed.h" -#include "tensorflow/core/lib/strings/proto_serialization.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/casts.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h" -#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" -#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" -#include "tensorflow/core/tpu/kernels/tpu_compile_op.h" -#include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" -#include "tensorflow/core/tpu/kernels/tpu_op_consts.h" -#include "tensorflow/core/tpu/kernels/tpu_op_util.h" -#include "tensorflow/core/tpu/kernels/tpu_program_group.h" -#include "tensorflow/core/tpu/kernels/tpu_program_group_interface.h" -#include "tensorflow/core/tpu/tpu_configuration.h" -#include "tensorflow/core/tpu/tpu_defs.h" - -namespace tensorflow { - -class XRTCompileOp : public OpKernel { - public: - explicit XRTCompileOp(OpKernelConstruction* ctx); - ~XRTCompileOp() override; - XRTCompileOp(const XRTCompileOp&) = delete; - XRTCompileOp& operator=(const XRTCompileOp&) = delete; - - void Compute(OpKernelContext* ctx) override; - - private: - Status Compile(const XLA_TpuMeshState* xla_mesh_state, - const xrt::XLAComputation& computation_proto, - tensorflow::tpu::TpuProgramGroupInterface* tpu_program_group); -}; - -XRTCompileOp::XRTCompileOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - -Status XRTCompileOp::Compile( - const XLA_TpuMeshState* xla_mesh_state, - const xrt::XLAComputation& computation_proto, - tensorflow::tpu::TpuProgramGroupInterface* tpu_program_group) { - 
return tensorflow::tpu::TpuProgramGroup::CompileAndBuild( - computation_proto, xla_mesh_state, tpu_program_group); -} - -tpu::TpuCompilationCacheKey CompilationCacheKey( - const xrt::XLAComputation& computation, - tensorflow::tpu::TpuMeshStateInterface* mesh_state, int num_replicas, - int num_cores_per_replica) { - string computation_serialized; - CHECK(SerializeToStringDeterministic(computation, &computation_serialized)); - tpu::TPUCompileMetadataProto metadata; - metadata.set_num_replicas(num_replicas); - metadata.set_num_cores_per_replica(num_cores_per_replica); - const tpu::TpuCompilationCacheKey key = CreateCompilationCacheKey( - "compile", 0, tensorflow::Fingerprint64(computation_serialized), {}, {}, - metadata, *mesh_state); - return key; -} - -void ExitCountdown(Env* env, std::shared_ptr> done) { - const int kSleepSeconds = 300; - LOG(INFO) << "TpuCompileOp was cancelled. Sleeping for " << kSleepSeconds - << " seconds to give time for TPUCompileOp to finished."; - env->SleepForMicroseconds(kSleepSeconds * 1000000); - if (done->load()) { - // If the TpuCompileOp has finished, then terminate peacefully. - return; - } - - LOG(ERROR) << "Aborting process due to cancelled TpuCompileOp. This " - << "termination is to ensure a consistent state."; - std::exit(42); -} - -void XRTCompileOp::Compute(OpKernelContext* ctx) { - VLOG(1) << "XRTCompileOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetCompileCell()); - - std::shared_ptr> done(new std::atomic(false)); - CancellationToken token = - ctx->cancellation_manager()->get_cancellation_token(); - const bool already_cancelled = - !ctx->cancellation_manager()->RegisterCallback(token, [ctx, done]() { - if (stream_executor::tpu::OpsApiFn() - ->TpuCompile_ShouldTpuCompileOpIgnoreCancellationFn()) { - return; - } - - // Sleep and exit in another thread so the cancellation manager can - // continue running callbacks. - Env* env = ctx->env(); - env->SchedClosure([env, done]() { ExitCountdown(env, done); }); - }); - - // If the RPC was cancelled before we registered the cancellation callback, - // don't compile the TPU program. - OP_REQUIRES(ctx, !already_cancelled, - absl::CancelledError("RPC cancelled, not compiling TPU program")); - - // We only want to abort the process if a cancellation actually occurs during - // compilation; we must deregister the callback in the success case. It - // doesn't hurt to also deregister the callback in the failure case; the - // CancellationManager ensures that already-registered callbacks will be run - // once cancellation has started. 
- auto cancellation_cleanup = absl::MakeCleanup([ctx, token, done] { - ctx->cancellation_manager()->DeregisterCallback(token); - done->store(true); - }); - - VLOG(1) << "Retrieving pod state"; - // Retrieve the topology from the resource manager - ResourceMgr* rm = GetTPUConfigResourceMgr(); - tensorflow::tpu::TpuMeshStateInterface* mesh_state; - OP_REQUIRES_OK(ctx, - rm->Lookup(rm->default_container(), - tensorflow::tpu::kTpuMeshStateInterfaceResourceName, - &mesh_state)); - core::ScopedUnref mesh_state_unref(mesh_state); - - const Tensor& computation_input = ctx->input(0); - OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(computation_input.shape()), - absl::InternalError("computation input should be a string scalar")); - - xrt::XLAComputation computation_proto; - OP_REQUIRES( - ctx, - computation_proto.ParseFromString(computation_input.scalar()()), - absl::InvalidArgumentError( - "Unable to parse computation input to XLAComputation")); - - const xrt::XLAComputationConfig& config = computation_proto.config(); - int num_replicas = config.num_replicas() ? config.num_replicas() : 1; - CHECK_GT(num_replicas, 0); - int num_cores_per_replica = - config.num_cores_per_replica() ? config.num_cores_per_replica() : 1; - - const tpu::TpuCompilationCacheKey key = CompilationCacheKey( - computation_proto, mesh_state, num_replicas, num_cores_per_replica); - - // Process-wide cache of Tpu executables. - tpu::TpuCompilationCacheInterface* cache; - OP_REQUIRES_OK(ctx, rm->Lookup( - rm->default_container(), - tpu::kCompilationCacheResourceName, &cache)); - core::ScopedUnref cache_unref(cache); - - int64_t uid; - std::vector proto_key; - std::vector shard_key; - std::vector may_modify_variables; - absl::Span hlo_metadata; - OP_REQUIRES_OK( - ctx, cache->CompileIfKeyAbsent( - key, /*session_metadata=*/nullptr, - /*per_step_ref_holder=*/nullptr, &uid, &proto_key, &shard_key, - &may_modify_variables, &hlo_metadata, - [&](tpu::TpuProgramGroupInterface* tpu_program_group) { - VLOG(1) << "Compiling TPU executable"; - return Compile(mesh_state->data(), computation_proto, - tpu_program_group); - })); - - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = uid; - ctx->set_output(0, output); - - Tensor program_shape_output(DT_STRING, TensorShape({num_cores_per_replica})); - for (int64_t i = 0; i < num_cores_per_replica; ++i) { - xla::ProgramShapeProto program_shape = - hlo_metadata[i]->hlo_module().host_program_shape(); - program_shape_output.vec()(i) = program_shape.SerializeAsString(); - } - ctx->set_output(1, program_shape_output); -} - -XRTCompileOp::~XRTCompileOp() = default; - -class XRTReleaseCompilationRefOp : public OpKernel { - public: - explicit XRTReleaseCompilationRefOp(OpKernelConstruction* ctx); - ~XRTReleaseCompilationRefOp() override; - XRTReleaseCompilationRefOp(const XRTReleaseCompilationRefOp&) = delete; - XRTReleaseCompilationRefOp& operator=(const XRTReleaseCompilationRefOp&) = - delete; - - void Compute(OpKernelContext* ctx) override; -}; - -XRTReleaseCompilationRefOp::XRTReleaseCompilationRefOp( - OpKernelConstruction* ctx) - : OpKernel(ctx) {} - -XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default; - -void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) { - VLOG(1) << "XRTReleaseCompilationRefOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell()); - ResourceMgr* rm = GetTPUConfigResourceMgr(); - OP_REQUIRES(ctx, rm != nullptr, absl::InternalError("No resource manager.")); - - // Process-wide cache of Tpu executables. 
- tpu::TpuCompilationCacheInterface* cache; - OP_REQUIRES_OK(ctx, rm->Lookup( - rm->default_container(), - tpu::kCompilationCacheResourceName, &cache)); - core::ScopedUnref cache_unref(cache); - - const Tensor& keys_tensor = ctx->input(0); - auto flat_keys = keys_tensor.flat(); - for (int64_t i = 0; i < flat_keys.size(); ++i) { - int64_t key = flat_keys(i); - OP_REQUIRES_OK(ctx, cache->Release(key)); - VLOG(2) << "Released computation handle " << key; - } -} - -REGISTER_KERNEL_BUILDER(Name("XRTCompile") - .Device(DEVICE_TPU_NODE) - .HostMemory("computation") - .HostMemory("handle"), - XRTCompileOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReleaseCompilationHandle") - .Device(DEVICE_TPU_NODE) - .HostMemory("handle"), - XRTReleaseCompilationRefOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/tpu_execute_op.cc b/tensorflow/compiler/xrt/kernels/tpu_execute_op.cc deleted file mode 100644 index 1073a103c8369a..00000000000000 --- a/tensorflow/compiler/xrt/kernels/tpu_execute_op.cc +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include -#include - -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/jit/xla_device.h" -#include "xla/hlo/ir/hlo_input_output_alias_config.h" -#include "xla/service/computation_placer.h" -#include "xla/shape_util.h" -#include "xla/status_macros.h" -#include "xla/statusor.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/stream_executor/stream_executor_internal.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/compiler/xrt/xrt_memory_manager.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" -#include "tensorflow/compiler/xrt/xrt_state.h" -#include "tensorflow/compiler/xrt/xrt_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/monitoring/timed.h" -#include "tensorflow/core/platform/casts.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/profiler/lib/traceme.h" -#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_lookup.h" -#include "tensorflow/core/tpu/kernels/tpu_op_consts.h" -#include "tensorflow/core/tpu/kernels/tpu_program_group.h" -#include "tensorflow/core/tpu/tpu_configuration.h" -#include "tensorflow/core/tpu/tpu_defs.h" -#include "tensorflow/core/tpu/tpu_execute.h" - -namespace tensorflow { -namespace { - -using tensorflow::tpu::CompilationCacheEntryRef; -using tensorflow::tpu::TpuCompilationCacheEntry; -using tensorflow::tpu::TpuCompilationCacheLookup; -using GetBufferFunction = - std::function>()>; - -// Looks up the input `key` in the compilation cache. 
-Status GetComputationCacheEntry( - ResourceMgr* rm, int64_t key, int core_index_in_replica, - std::unique_ptr* entry) { - profiler::TraceMe trace_me("XRTExecuteOp::LookupProto", /*level=*/2); - TpuCompilationCacheLookup* proto_lookup; - TF_RETURN_IF_ERROR(rm->Lookup(rm->default_container(), - tpu::kCompiledProtoCacheResourceName, - &proto_lookup)); - core::ScopedUnref lookup_unref(proto_lookup); - TF_RETURN_IF_ERROR(proto_lookup->Lookup(key, core_index_in_replica, entry)); - return OkStatus(); -} - -std::vector GetDynamicInputInfo( - const TPUExecutableInfoProto& executable_proto) { - std::vector input_is_dynamic; - input_is_dynamic.reserve(executable_proto.input_shapes().size()); - for (int64_t i = 0; i < executable_proto.input_shapes().size(); ++i) { - input_is_dynamic.push_back( - !xla::Shape(executable_proto.input_shapes(i)).is_static()); - } - return input_is_dynamic; -} - -xla::StatusOr>> GetChainedOpInputs( - const xrt::XRTChainedExecuteOp& op, - absl::Span> op_inputs, - const TPUExecutableInfoProto& executable_proto) { - if (op.inputs_size() != executable_proto.input_shapes_size()) { - return errors::InvalidArgument( - "Number of inputs does not match executable proto input shapes: ", - op.inputs_size(), " vs. ", executable_proto.input_shapes_size()); - } - - std::vector> input_tuples; - input_tuples.reserve(op.inputs_size()); - for (int i = 0; i < op.inputs_size(); ++i) { - auto& input = op.inputs(i); - const RefPtr& tuple = op_inputs[i]; - // Thanks to the greatness of proto3, there is no way to query for - // explicitly set fields, so the default for output_index (zero) means no - // sub-index. As consequence, the real index is output_index - 1. - if (input.output_index() == 0) { - input_tuples.push_back(tuple); - } else { - XRTTupleAllocation* sub_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( - tuple.get(), {input.output_index() - 1}, &sub_tuple, - /*alias_parent_allocation=*/true)); - input_tuples.emplace_back(sub_tuple); - } - if (!InputShapeMatches(xla::Shape(executable_proto.input_shapes(i)), - input_tuples.back()->on_host_shape())) { - return errors::InvalidArgument( - "Run-time shape mismatch for XRTExecute argument[", i, "] (", - op.computation_handle(), "). 
Expected ", - executable_proto.input_shapes(i).DebugString(), "; got ", - tuple->on_host_shape().DebugString()); - } - } - return std::move(input_tuples); -} - -xla::StatusOr GetExecutableAliasConfig( - const tpu::TpuProgramGroup* tpu_program_group, xla::Backend* const backend, - int core_index) { - const TPUExecutableInfoProto& executable = - tpu_program_group->executable_info(core_index); - return xla::HloInputOutputAliasConfig::CreateFromProto( - backend->transfer_manager()->HostShapeToDeviceShape( - xla::Shape(executable.output_shape())), - tpu_program_group->hlo_metadata(core_index) - ->hlo_module() - .input_output_alias()); -} - -xla::StatusOr> AllocateOutputTuple( - tpu::TpuNodeContext* node_context, se::Stream* stream, - absl::Span> input_tuples, - const xla::HloInputOutputAliasConfig& input_output_alias, - xla::ScopedShapedBuffer output_scoped_buffer, int device_ordinal) { - auto output_shaped_buffer = output_scoped_buffer.release(); - - xla::Shape output_device_shape = output_shaped_buffer.on_device_shape(); - if (!output_device_shape.is_static()) { - TF_RETURN_IF_ERROR( - node_context->backend()->transfer_manager()->ReadDynamicShapes( - stream, &output_shaped_buffer, &output_device_shape)); - } - - XRTTupleAllocation* output_tuple; - xla::Shape output_host_shape = - xla::ShapeUtil::DeviceShapeToHostShape(output_device_shape); - - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - output_shaped_buffer, output_host_shape, output_device_shape, - node_context->backend(), device_ordinal, &output_tuple, - node_context->backend()->memory_allocator())); - RefPtr output_tuple_ptr(output_tuple); - - // If the input tuples had to release some buffers in order to provide the - // proper temporary ownership transfer, we patch the holes here by alising the - // buffers from the result tuple. The device address we patch back here, will - // essentially be the same one we carved out in the DoWork() function. 
- TF_RETURN_IF_ERROR( - RebuildOutputAliases(output_tuple_ptr, input_tuples, input_output_alias)); - - return std::move(output_tuple_ptr); -} - -Status AllocateOutputTensors( - OpKernelContext* context, XRTMemoryManager* memory_manager, - tpu::TpuNodeContext* node_context, se::Stream* stream, - const xrt::XRTExecutionConfig& config_proto, - const TPUExecutableInfoProto& executable_proto, - absl::Span> input_tuples, - const xla::HloInputOutputAliasConfig& input_output_alias, - xla::ScopedShapedBuffer output_scoped_buffer, int device_ordinal) { - TF_ASSIGN_OR_RETURN( - RefPtr output_tuple, - AllocateOutputTuple(node_context, stream, input_tuples, - input_output_alias, std::move(output_scoped_buffer), - device_ordinal)); - return CreateExecuteOutput(context, memory_manager, std::move(output_tuple), - config_proto.return_exploded_tuple()); -} - -xla::StatusOr RunExecutable( - OpKernelContext* context, tpu::TpuNodeContext* node_context, - const TPUExecutableInfoProto& executable, - std::vector arguments, const string& execution_id, - const uint32 rng_seed, const tpu::TpuProgramGroup* tpu_program_group, - xla::Backend* const backend, se::Stream* stream, int core_index, - int device_ordinal, string rendezvous_key_base) { - profiler::TraceMe trace_me("RunExecutable", /*level=*/2); - - // se::StreamExecutor* executor = node->stream_executor(); - - std::unique_ptr device_assignment; - if (executable.has_device_assignment()) { - TF_ASSIGN_OR_RETURN(device_assignment, xla::DeviceAssignment::Deserialize( - executable.device_assignment())); - } - // Ideally this should be the host-to-device stream from XlaDeviceContext. - // The particular anti-dependency this is avoiding (why we need a separate - // transfer stream) is between the executable writing tuple tables and - // TPUExecute()'s deregister_stream; if they come from the same stream pool - // antidependencies will occur. XlaBackend has a different pool of streams - // to the stream->GetOrCreateSubStream() that TPUExecute() uses, so these - // will never refer to the same stream. 
- TF_ASSIGN_OR_RETURN(auto transfer_stream_ptr, - backend->BorrowStream(device_ordinal)); - const TPUHostTransferInfoProto& host_transfer_info = - tpu_program_group->host_transfer_info(core_index); - TF_ASSIGN_OR_RETURN( - xla::ExecutionOutput output, - TPUExecute(executable, host_transfer_info, - *tpu_program_group->hlo_metadata(core_index), - std::move(arguments), rendezvous_key_base, rng_seed, - node_context, device_assignment.get(), - context->cancellation_manager(), context, stream, - transfer_stream_ptr.get(), - tpu_program_group->tpu_program(core_index))); - - TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - - return output; -} - -xla::StatusOr ExecuteTPUProgram( - OpKernelContext* context, tpu::TpuNodeContext* node_context, - XRTMemoryManager* memory_manager, const TPUExecutableInfoProto& executable, - const GetBufferFunction& get_buffers_fn, const string& execution_id, - const uint32 rng_seed, const tpu::TpuProgramGroup* tpu_program_group, - xla::Backend* const backend, se::Stream* stream, int core_index, - int device_ordinal, string rendezvous_key_base) { - auto runfn = [&]() -> xla::StatusOr { - TF_ASSIGN_OR_RETURN(auto arguments, get_buffers_fn()); - return RunExecutable(context, node_context, executable, - std::move(arguments), execution_id, rng_seed, - tpu_program_group, backend, stream, core_index, - device_ordinal, rendezvous_key_base); - }; - return memory_manager->Run( - runfn, backend, device_ordinal, /*requested_free_size=*/0, - backend->memory_allocator()); -} - -// XRTExecuteOp - -class XRTExecuteOp : public AsyncOpKernel { - public: - explicit XRTExecuteOp(OpKernelConstruction* context); - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override; - - private: - Status DoWork(OpKernelContext* context); -}; - -XRTExecuteOp::XRTExecuteOp(OpKernelConstruction* context) - : AsyncOpKernel(context, /* is_deferred = */ true) {} - -void XRTExecuteOp::ComputeAsync(OpKernelContext* context, DoneCallback done) { - // Schedule onto the default queue, for unbounded concurrency. See b/73520706 - OP_REQUIRES_OK_ASYNC(context, DoWork(context), done); - done(); -} - -Status XRTExecuteOp::DoWork(OpKernelContext* context) { - VLOG(1) << "XRTExecuteOp::Compute"; - - const XlaDevice::Metadata* metadata; - TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(context, &metadata)); - const int device_ordinal = metadata->device_ordinal(); - // We are guaranteed that the object underlying TpuNodeContext won't be - // deleted out from under us, while node_context is alive. 
- TF_ASSIGN_OR_RETURN(std::unique_ptr node_context, - tpu::TpuNodeContext::Create(device_ordinal)); - xla::Backend* const backend = node_context->backend(); - se::Stream* stream = context->op_device_context()->stream(); - - auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteCell()); - profiler::TraceMe trace_me( - [context] { - return profiler::TraceMeEncode("TpuExecuteOp", - {{"step_id", context->step_id()}}); - }, - /*level=*/2); - profiler::TraceMe trace_me_init("XRTExecuteOp::Init", /*level=*/2); - - auto* rm = GetTPUConfigResourceMgr(); - TF_RET_CHECK(rm != nullptr); - - const Tensor& execution_input = context->input(0); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_input.shape())); - int64_t compilation_handle = execution_input.scalar()(); - - const Tensor& execution_config = context->input(1); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape())); - xrt::XRTExecutionConfig config_proto; - TF_RET_CHECK( - config_proto.ParseFromString(execution_config.scalar()())); - - int core_index_in_replica = config_proto.core_index_in_replica(); - bool release_inputs = config_proto.release_input_handles(); - bool release_compilation = config_proto.release_compilation_handle(); - - string rendezvous_key_base = std::to_string(compilation_handle); - std::unique_ptr entry; - TF_RETURN_IF_ERROR(GetComputationCacheEntry(rm, compilation_handle, - core_index_in_replica, &entry)); - - TpuCompilationCacheEntry centry = entry->get(); - const tpu::TpuProgramGroup* tpu_program_group = - tensorflow::down_cast( - centry.tpu_program_group()); - CHECK_NE(tpu_program_group, nullptr); - - if (release_compilation) { - // Process-wide cache of Tpu executables. - tpu::TpuCompilationCacheInterface* cache; - TF_RETURN_IF_ERROR(rm->Lookup( - rm->default_container(), tpu::kCompilationCacheResourceName, &cache)); - core::ScopedUnref cache_unref(cache); - TF_RETURN_IF_ERROR(cache->Release(compilation_handle)); - VLOG(2) << "Released compilation handle " << compilation_handle; - } - - const int core_index = centry.core_index(); - const TPUExecutableInfoProto& executable = - tpu_program_group->executable_info(core_index); - - std::vector input_is_dynamic = GetDynamicInputInfo(executable); - - TF_ASSIGN_OR_RETURN( - xla::HloInputOutputAliasConfig input_output_alias, - GetExecutableAliasConfig(tpu_program_group, backend, core_index)); - TF_ASSIGN_OR_RETURN(std::vector input_coords, - GetComputationInputs(context, "input_handles")); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - XRTMemoryManager::WorkingSet working_set(memory_manager); - TF_ASSIGN_OR_RETURN( - std::vector> input_tuples, - GetInputTupleAllocations( - input_coords, &working_set, backend, executable.input_shapes_size(), - [&](int64_t i) { return xla::Shape(executable.input_shapes(i)); }, - release_inputs, backend->memory_allocator())); - auto get_buffers_fn = [&]() { - return GetArgumentsBuffers(input_output_alias, input_tuples, - input_is_dynamic, release_inputs); - }; - trace_me_init.Stop(); - - TF_ASSIGN_OR_RETURN( - xla::ExecutionOutput output, - ExecuteTPUProgram( - context, node_context.get(), memory_manager.get(), executable, - get_buffers_fn, config_proto.execution_instance_key(), - config_proto.rng_seed(), tpu_program_group, backend, stream, - core_index, device_ordinal, rendezvous_key_base)); - - // AllocateComputationOutput writes the output tuple handle to the output - // tensor return value from the Op. 
- TF_RETURN_IF_ERROR(AllocateOutputTensors( - context, memory_manager.get(), node_context.get(), stream, config_proto, - executable, input_tuples, input_output_alias, output.ConsumeResult(), - device_ordinal)); - return OkStatus(); -} - -class XRTExecuteChainedOp : public AsyncOpKernel { - public: - explicit XRTExecuteChainedOp(OpKernelConstruction* context); - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override; - - private: - Status DoWork(OpKernelContext* context); -}; - -XRTExecuteChainedOp::XRTExecuteChainedOp(OpKernelConstruction* context) - : AsyncOpKernel(context, /* is_deferred = */ true) {} - -void XRTExecuteChainedOp::ComputeAsync(OpKernelContext* context, - DoneCallback done) { - // Schedule onto the default queue, for unbounded concurrency. See b/73520706 - OP_REQUIRES_OK_ASYNC(context, DoWork(context), done); - done(); -} - -Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { - VLOG(1) << "XRTExecuteChainedOp::Compute"; - const XlaDevice::Metadata* metadata; - TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(context, &metadata)); - const int device_ordinal = metadata->device_ordinal(); - // We are guaranteed that the object underlying TpuNodeContext won't be - // deleted out from under us, while node_context is alive. - TF_ASSIGN_OR_RETURN(std::unique_ptr node_context, - tpu::TpuNodeContext::Create(device_ordinal)); - xla::Backend* const backend = node_context->backend(); - se::Stream* stream = context->op_device_context()->stream(); - auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteChainedCell()); - profiler::TraceMe trace_me( - [context] { - return profiler::TraceMeEncode("TpuExecuteChainedOp", - {{"step_id", context->step_id()}}); - }, - /*level=*/2); - ResourceMgr* rm = GetTPUConfigResourceMgr(); - TF_RET_CHECK(rm != nullptr); - - const Tensor& execution_plan = context->input(0); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_plan.shape())); - xrt::XRTChainedExecutePlan plan; - TF_RET_CHECK(plan.ParseFromString(execution_plan.scalar()())); - - const Tensor& execution_config = context->input(1); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape())); - xrt::XRTChainedExecuteConfig config; - TF_RET_CHECK(config.ParseFromString(execution_config.scalar()())); - - TpuCompilationCacheLookup* proto_lookup; - TF_RETURN_IF_ERROR(rm->Lookup(rm->default_container(), - tpu::kCompiledProtoCacheResourceName, - &proto_lookup)); - core::ScopedUnref lookup_unref(proto_lookup); - RefPtr memory_manager = XRTMemoryManager::Get(rm); - auto execute_op = [&](const xrt::XRTChainedExecuteOp& op, - absl::Span> op_inputs) - -> xla::StatusOr> { - std::unique_ptr entry; - TF_RETURN_IF_ERROR(proto_lookup->Lookup( - op.computation_handle(), config.core_index_in_replica(), &entry)); - string rendezvous_key_base = std::to_string(op.computation_handle()); - TpuCompilationCacheEntry centry = entry->get(); - const tpu::TpuProgramGroup* tpu_program_group = - tensorflow::down_cast( - centry.tpu_program_group()); - CHECK_NE(tpu_program_group, nullptr); - const int core_index = centry.core_index(); - const TPUExecutableInfoProto& executable = - tpu_program_group->executable_info(core_index); - std::vector input_is_dynamic = GetDynamicInputInfo(executable); - - TF_ASSIGN_OR_RETURN( - xla::HloInputOutputAliasConfig input_output_alias, - GetExecutableAliasConfig(tpu_program_group, backend, core_index)); - TF_ASSIGN_OR_RETURN(std::vector> input_tuples, - GetChainedOpInputs(op, op_inputs, executable)); - auto get_buffers_fn = [&]() { - return 
GetArgumentsBuffers(input_output_alias, input_tuples, - input_is_dynamic, - /*release_inputs=*/false); - }; - TF_ASSIGN_OR_RETURN( - xla::ExecutionOutput output, - ExecuteTPUProgram(context, node_context.get(), memory_manager.get(), - executable, get_buffers_fn, - config.execution_instance_key(), config.rng_seed(), - tpu_program_group, backend, stream, core_index, - device_ordinal, rendezvous_key_base)); - return AllocateOutputTuple(node_context.get(), stream, input_tuples, - input_output_alias, output.ConsumeResult(), - device_ordinal); - }; - - return ExecuteChained(context, memory_manager, backend, device_ordinal, plan, - config, execute_op, backend->memory_allocator()); -} - -} // namespace - -REGISTER_KERNEL_BUILDER(Name("XRTExecute") - .Device(DEVICE_TPU_NODE) - .HostMemory("computation_handle") - .HostMemory("execution_config") - .HostMemory("input_handles") - .HostMemory("output_handle"), - XRTExecuteOp); - -REGISTER_KERNEL_BUILDER(Name("XRTExecuteChained") - .Device(DEVICE_TPU_NODE) - .HostMemory("execution_plan") - .HostMemory("execution_config") - .HostMemory("output_handle"), - XRTExecuteChainedOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/tpu_state_op.cc b/tensorflow/compiler/xrt/kernels/tpu_state_op.cc deleted file mode 100644 index 6fe1321c413887..00000000000000 --- a/tensorflow/compiler/xrt/kernels/tpu_state_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for allocating XLA literals in device memory and managing handles -// that refer to them. 
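// Note added for clarity (not in the original file): the registrations below
// all place the kernels on DEVICE_TPU_NODE while pinning their handle and
// serialized-proto tensors to host memory with .HostMemory(...), since
// handles are plain int64 scalars and protos are strings that the kernels
// parse on the CPU.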
- -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "xla/client/local_client.h" -#include "tensorflow/compiler/xrt/kernels/xrt_state_ops.h" -#include "tensorflow/compiler/xrt/xrt_tpu_device.h" -#include "tensorflow/core/tpu/tpu_defs.h" - -namespace tensorflow { -REGISTER_KERNEL_BUILDER(Name("XRTAllocate") - .Device(DEVICE_TPU_NODE) - .HostMemory("allocation") - .HostMemory("handle"), - XRTAllocateOp); - -REGISTER_KERNEL_BUILDER(Name("XRTAllocateUninitialized") - .Device(DEVICE_TPU_NODE) - .HostMemory("handle"), - XRTAllocateUninitializedOp); - -REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor") - .Device(DEVICE_TPU_NODE) - .HostMemory("inputs") - .HostMemory("handle"), - XRTAllocateFromTensorOp); - -REGISTER_KERNEL_BUILDER(Name("XRTSubTuple") - .Device(DEVICE_TPU_NODE) - .HostMemory("base_handle") - .HostMemory("shape_index") - .HostMemory("output_handle"), - XRTSubTupleOp); - -REGISTER_KERNEL_BUILDER(Name("XRTSubTupleAndRelease") - .Device(DEVICE_TPU_NODE) - .HostMemory("base_handle") - .HostMemory("shape_index") - .HostMemory("output_handle"), - XRTSubTupleOp); - -REGISTER_KERNEL_BUILDER(Name("XRTMakeTuple") - .Device(DEVICE_TPU_NODE) - .HostMemory("tuple_description") - .HostMemory("input_handles") - .HostMemory("output_handle"), - XRTMakeTupleOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral") - .Device(DEVICE_TPU_NODE) - .HostMemory("handle") - .HostMemory("literal"), - XRTReadLiteralOp); - -REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral") - .Device(DEVICE_TPU_NODE) - .HostMemory("handle") - .HostMemory("literal") - .HostMemory("output_handle"), - XRTWriteLiteralOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease") - .Device(DEVICE_TPU_NODE) - .HostMemory("handle") - .HostMemory("literal"), - XRTReadLiteralOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor") - .Device(DEVICE_TPU_NODE) - .HostMemory("handles") - .HostMemory("tensors"), - XRTReadToTensorOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle") - .Device(DEVICE_TPU_NODE) - .HostMemory("handle"), - XRTReleaseAllocationOp); - -REGISTER_KERNEL_BUILDER( - Name("XRTReleaseAllAllocations").Device(DEVICE_TPU_NODE), - XRTReleaseAllAllocationsOp); - -REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_TPU_NODE), - XRTCompactAllocationsOp); - -REGISTER_KERNEL_BUILDER(Name("XRTMemoryInfo").Device(DEVICE_TPU_NODE), - XRTMemoryInfoOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc deleted file mode 100644 index ec6a9c56dfbdab..00000000000000 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ /dev/null @@ -1,301 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for compiling XLA computations and managing handles that refer to -// them. 
- -#include -#include -#include -#include -#include - -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "xla/client/client_library.h" -#include "xla/client/xla_computation.h" -#include "xla/service/compiler.h" -#include "xla/status_macros.h" -#include "xla/statusor.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" -#include "tensorflow/compiler/xrt/xrt_device.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" -#include "tensorflow/compiler/xrt/xrt_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/monitoring/timed.h" -#include "tensorflow/core/lib/strings/proto_serialization.h" -#include "tensorflow/core/platform/fingerprint.h" -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { - -namespace { - -Status GenerateXlaDeviceAssignment( - const xrt::DeviceAssignment& xrt_device_assignment, int num_replicas, - int num_cores_per_replica, xla::DeviceAssignment* device_assignment) { - if (num_cores_per_replica != - xrt_device_assignment.computation_devices_size()) { - return errors::InvalidArgument( - "Device assignment does not have the correct number of " - "computation_devices: num_cores_per_replica=", - num_cores_per_replica, " computation_devices=", - xrt_device_assignment.computation_devices_size()); - } - for (int64_t c = 0; c < xrt_device_assignment.computation_devices_size(); - ++c) { - const auto& computation_devices = - xrt_device_assignment.computation_devices(c); - if (num_replicas != computation_devices.replica_devices_size()) { - return errors::InvalidArgument( - "Device assignment does not have the correct number of " - "replica_device_ids: num_replicas=", - num_replicas, - " replica_devices=", computation_devices.replica_devices_size()); - } - for (int64_t r = 0; r < computation_devices.replica_devices_size(); ++r) { - const auto& coords = computation_devices.replica_devices(r); - if (coords.value_size() != 4) { - return errors::InvalidArgument( - "Device assignment mesh coordinates must have 4 entries, got ", - coords.value_size()); - } - for (int n = 0; n < 3; ++n) { - if (coords.value(n) != 0) { - return errors::InvalidArgument("Mesh coordinate at index ", n, - " must be 0, got ", coords.value(n)); - } - } - (*device_assignment)(r, c) = coords.value(3); - } - } - return OkStatus(); -} - -class XRTCompileOp : public OpKernel { - public: - explicit XRTCompileOp(OpKernelConstruction* ctx); - ~XRTCompileOp() override; - XRTCompileOp(const XRTCompileOp&) = delete; - XRTCompileOp& operator=(const XRTCompileOp&) = delete; - - void Compute(OpKernelContext* ctx) override; - - private: - Status Compile(OpKernelContext* ctx, - const xrt::XLAComputation& computation_proto, - std::unique_ptr* program); -}; - -Status CompilationCacheKey(const xrt::XLAComputation& computation, - string* key) { - const size_t size = computation.ByteSizeLong(); - auto serialized = absl::make_unique(size); - TF_RET_CHECK( - SerializeToBufferDeterministic(computation, serialized.get(), size)); - uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size)); - *key = absl::StrCat(fingerprint); - return OkStatus(); 
-} - -XRTCompileOp::XRTCompileOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - -Status XRTCompileOp::Compile(OpKernelContext* ctx, - const xrt::XLAComputation& computation_proto, - std::unique_ptr* program) { - const xrt::XLAComputationConfig& config = computation_proto.config(); - // Sanity checks for options not yet supported. - int num_cores_per_replica = std::max(config.num_cores_per_replica(), 1); - TF_RET_CHECK(num_cores_per_replica == 1); - TF_RET_CHECK(config.per_core_program_shape_size() == 0); - - // The default config value is 0; treat it as 1 for convenience. - int num_replicas = config.num_replicas() ? config.num_replicas() : 1; - - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. - class XRTGenericDeviceAccessor::ScopedRef device_ref; - TF_RETURN_IF_ERROR(XRTGenericDeviceAccessor::InitScopedRef(ctx, &device_ref)); - - xla::LocalClient* client = device_ref.client(); - - // There is officially no way to use XLA in a client/server architecture where - // client and server are built from different revisions, because the XLA team - // does not want to give any guarantees about the stability of the Hlo - // proto. For cloud TPU this is fine because server and client versions can be - // assumed to be synced to the same version. For general use the mechanism - // here (using a snapshot from XlaComputation) works as well as the "official" - // XLA client/server design, which serializes the same proto between client - // and server, so in reality is probably fine. - TF_ASSIGN_OR_RETURN(xla::XlaComputation computation, - client->LoadSnapshot(computation_proto.hlo_snapshot())); - - std::vector argument_layouts( - config.program_shape().parameters_size()); - std::vector argument_layout_ptrs( - config.program_shape().parameters_size()); - for (int i = 0; i < config.program_shape().parameters_size(); ++i) { - argument_layouts[i] = xla::Shape(config.program_shape().parameters(i)); - argument_layout_ptrs[i] = &argument_layouts[i]; - } - xla::ExecutableBuildOptions build_options; - build_options.set_device_ordinal(device_ref.device_ordinal()); - build_options.set_num_replicas(num_replicas); - build_options.set_result_layout(xla::Shape(config.program_shape().result())); - build_options.set_device_allocator(device_ref.allocator()); - if (config.has_debug_options()) { - *build_options.mutable_debug_options() = - BuildXlaDebugOptions(config.debug_options()); - } - if (config.has_device_assignment()) { - xla::DeviceAssignment device_assignment(num_replicas, - num_cores_per_replica); - TF_RETURN_IF_ERROR( - GenerateXlaDeviceAssignment(config.device_assignment(), num_replicas, - num_cores_per_replica, &device_assignment)); - build_options.set_device_assignment(device_assignment); - } - - VLOG(1) << "Building executable"; - TF_ASSIGN_OR_RETURN( - auto executables, - client->Compile(computation, argument_layout_ptrs, build_options)); - TF_RET_CHECK(executables.size() == 1); - *program = std::move(executables[0]); - return OkStatus(); -} - -void XRTCompileOp::Compute(OpKernelContext* ctx) { - VLOG(1) << "XRTCompileOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetCompileCell()); - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm)); - - const Tensor& computation_input = ctx->input(0); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(computation_input.shape()), - errors::Internal("computation input should be a string scalar")); - - xrt::XLAComputation 
computation_proto; - OP_REQUIRES(ctx, - ParseFromTString(computation_input.scalar()(), - &computation_proto), - errors::InvalidArgument( - "Unable to parse computation input to XLAComputation")); - - string key; - OP_REQUIRES_OK(ctx, CompilationCacheKey(computation_proto, &key)); - - // Process-wide cache of XLA executables. - auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( - ctx, /*max_number_of_entries=*/0); - OP_REQUIRES_OK(ctx, cache_or.status()); - auto cache = std::move(cache_or).value(); - - int64_t uid; - OP_REQUIRES_OK( - ctx, cache->CompileIfKeyAbsent( - key, &uid, [&](std::unique_ptr* program) { - VLOG(1) << "Compiling XLA executable"; - return Compile(ctx, computation_proto, program); - })); - std::unique_ptr entry; - OP_REQUIRES_OK(ctx, cache->Lookup(uid, &entry)); - - Tensor handle_output(DT_INT64, TensorShape({})); - handle_output.scalar()() = uid; - ctx->set_output(0, handle_output); - - xla::LocalExecutable* executable = entry->get().get_executable(); - xla::ProgramShapeProto program_shape = executable->executable() - ->module() - .config() - .entry_computation_layout() - .ComputeProgramShape() - .ToProto(); - Tensor program_shape_output(DT_STRING, TensorShape({1})); - program_shape_output.vec()(0) = program_shape.SerializeAsString(); - ctx->set_output(1, program_shape_output); -} - -XRTCompileOp::~XRTCompileOp() = default; - -class XRTReleaseCompilationRefOp : public OpKernel { - public: - explicit XRTReleaseCompilationRefOp(OpKernelConstruction* ctx); - ~XRTReleaseCompilationRefOp() override; - XRTReleaseCompilationRefOp(const XRTReleaseCompilationRefOp&) = delete; - XRTReleaseCompilationRefOp& operator=(const XRTReleaseCompilationRefOp&) = - delete; - - void Compute(OpKernelContext* ctx) override; -}; - -XRTReleaseCompilationRefOp::XRTReleaseCompilationRefOp( - OpKernelConstruction* ctx) - : OpKernel(ctx) {} - -XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default; - -void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) { - VLOG(1) << "XRTReleaseCompilationRefOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell()); - - // Process-wide cache of XLA executables. 
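// Clarifying note (an assumption about the cache's refcounting, not stated in
// this file): Release(key) below only drops the cache's own reference to the
// entry; an XRTExecute op that has already looked the handle up keeps its
// compilation-cache entry ref alive, so in-flight executions are unaffected.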
- auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( - ctx, /*max_number_of_entries=*/0); - OP_REQUIRES_OK(ctx, cache_or.status()); - auto cache = std::move(cache_or).value(); - - const Tensor& keys_tensor = ctx->input(0); - auto flat_keys = keys_tensor.flat(); - for (int64_t i = 0; i < flat_keys.size(); ++i) { - int64_t key = flat_keys(i); - OP_REQUIRES_OK(ctx, cache->Release(key)); - VLOG(2) << "Released computation handle " << key; - } -} - -} // namespace - -REGISTER_KERNEL_BUILDER(Name("XRTCompile") - .Device(DEVICE_XLA_CPU) - .HostMemory("computation") - .HostMemory("handle"), - XRTCompileOp); -REGISTER_KERNEL_BUILDER(Name("XRTCompile") - .Device(DEVICE_XLA_GPU) - .HostMemory("computation") - .HostMemory("handle"), - XRTCompileOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReleaseCompilationHandle") - .Device(DEVICE_XLA_CPU) - .HostMemory("handle"), - XRTReleaseCompilationRefOp); -REGISTER_KERNEL_BUILDER(Name("XRTReleaseCompilationHandle") - .Device(DEVICE_XLA_GPU) - .HostMemory("handle"), - XRTReleaseCompilationRefOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc deleted file mode 100644 index 47c2fa2f2b92c6..00000000000000 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ /dev/null @@ -1,618 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "xla/hlo/ir/hlo_input_output_alias_config.h" -#include "xla/literal_util.h" -#include "xla/service/computation_placer.h" -#include "xla/service/gpu/gpu_executable_run_options.h" -#include "xla/shape_util.h" -#include "xla/status_macros.h" -#include "xla/statusor.h" -#include "xla/stream_executor/device_memory.h" -#include "xla/stream_executor/device_memory_allocator.h" -#include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/stream_executor/stream_executor_internal.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" -#include "tensorflow/compiler/xrt/xrt_device.h" -#include "tensorflow/compiler/xrt/xrt_memory_manager.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" -#include "tensorflow/compiler/xrt/xrt_state.h" -#include "tensorflow/compiler/xrt/xrt_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/monitoring/timed.h" -#include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { - -namespace { - -uint32 InitialRandomSeed() { - // Support plumbing the TF seed through to XLA is being worked on. - // If a user wants deterministic behavior, their best option - // is to start with a known checkpoint. This also handles issues when - // multiple random calls can be invoked in any order by TF executor. - // Another option is to use stateless random ops. They have much cleaner - // semantics. - // If a user really wants to set a deterministic seed for XLA-based - // devices, this is the place to do it. - std::random_device rd; - // Make the starting value odd. - return rd() | 1; -} - -uint32 GetXLARandomSeed() { - // We initialize counter with an odd number and increment it by two - // everytime. This ensures that it will never be zero, even - // after an overflow. When seeded with zero, some XLA backends - // can return all zeros instead of random numbers. 
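// Added illustration (not in the original source): the counter below starts
// odd and every fetch_add(2) preserves that parity, so the value handed out
// is never zero even after a uint32 wraparound, e.g.
//   0xFFFFFFFD -> 0xFFFFFFFF -> 0x00000001 -> 0x00000003 -> ...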
- static std::atomic counter(InitialRandomSeed()); - return counter.fetch_add(2); -} - -std::vector GetDynamicInputInfo( - const xla::ComputationLayout& computation_layout) { - std::vector input_is_dynamic; - input_is_dynamic.reserve(computation_layout.parameter_count()); - for (int64_t i = 0; i < computation_layout.parameter_count(); ++i) { - input_is_dynamic.push_back( - !computation_layout.parameter_shape(i).is_static()); - } - return input_is_dynamic; -} - -xla::StatusOr>> GetInputTuples( - xla::LocalExecutable* executable, XRTMemoryManager::WorkingSet* working_set, - xla::Backend* backend, const std::vector& input_coords, - bool release_inputs, se::DeviceMemoryAllocator* allocator) { - const xla::ComputationLayout& computation_layout = - executable->executable()->module_config().entry_computation_layout(); - - return GetInputTupleAllocations( - input_coords, working_set, backend, computation_layout.parameter_count(), - [&](int64_t i) { return computation_layout.parameter_shape(i); }, - release_inputs, allocator); -} - -xla::StatusOr>> GetChainedOpInputTuples( - const xrt::XRTChainedExecuteOp& op, - absl::Span> op_inputs) { - std::vector> input_tuples; - input_tuples.reserve(op.inputs_size()); - for (int i = 0; i < op.inputs_size(); ++i) { - auto& input = op.inputs(i); - // Thanks to the greatness of proto3, there is no way to query for - // explicitly set fields, so the default for output_index (zero) means no - // sub-index. As consequence, the real index is output_index - 1. - if (input.output_index() == 0) { - input_tuples.emplace_back(op_inputs[i]); - } else { - XRTTupleAllocation* sub_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( - op_inputs[i].get(), {input.output_index() - 1}, &sub_tuple, - /*alias_parent_allocation=*/true)); - input_tuples.emplace_back(sub_tuple); - } - } - return input_tuples; -} - -// Given a shape, returns a byte array representing the shape metadata of the -// shape. The shape metadata contains dimensions sizes stored as contiguous S32. -std::vector PrepareMetadata(const xla::Shape& shape) { - DCHECK(shape.is_static()); - DCHECK(shape.IsArray()); - // Each dimension size is stored as a S32. - std::vector result(shape.dimensions_size()); - for (int64_t i = 0; i < shape.dimensions_size(); ++i) { - result[i] = shape.dimensions(i); - } - return result; -} - -// Given a buffer with dynamic shape, update buffer metadata at the correct -// offset starting from that buffer. -// -// +-----------+ -// |Payload | -// +-----------+ -// | Padding | -// +-----------+ -// |dim_size_0 | (each dim_size is a S32): -// +-----------+ -// |dim_size_1 | -// +-----------+ -// .......... 
-// +-----------+ -// -// Size of payload = ByteSizeOf(runtime_shape) -// Size of payload + padding = ByteSizeOf(compile_time_shape_static) -// Size of payload + padding + metadata = ByteSizeOf(compile_time_shape) -Status UpdateMetadata(se::Stream* stream, se::DeviceMemory* buffer, - const xla::Shape& compile_time_shape, - const xla::Shape& runtime_shape) { - TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( - stream->parent()->platform())); - TF_ASSIGN_OR_RETURN( - auto transfer_manager, - xla::TransferManager::GetForPlatform(stream->parent()->platform())); - auto shape_size_fn = compiler->ShapeSizeBytesFunction(); - xla::Shape compile_time_shape_static = - xla::ShapeUtil::MakeStaticShape(compile_time_shape); - uint64 offset = shape_size_fn(compile_time_shape_static); - uint64 metadata_size = shape_size_fn(compile_time_shape) - offset; - auto metadata_buffer = - stream->parent()->GetSubBuffer(buffer, offset, metadata_size); - - auto metadata_literal = std::make_shared( - xla::LiteralUtil::CreateR1(PrepareMetadata(runtime_shape))); - TF_RETURN_IF_ERROR(transfer_manager->TransferArrayToDeviceAsync( - stream, *metadata_literal, metadata_buffer)); - // Retain the literal until the end of the transfer. - stream->ThenDoHostCallback([keep_alive = std::move(metadata_literal)] {}); - return OkStatus(); -} - -// Given a static input buffer, convert it to dynamic form by expanding it to -// the bounded size and attaching a metadata filled with dimension sizes. -// -// From: -// +--------+ -// |Payload | -// +--------+ -// -// To: -// -// +--------+ -// |Payload | -// +--------+ -// | Padding| -// +--------+ -// |Metadata| -// +--------+ -// -// As we can't expand the size of an existing memory allocation, a reallocation -// is required. A list of new allocations are returned after this function. The -// caller is reponsible for maintaining those allocations. -Status UpdateDynamicInputs( - se::Stream* stream, se::DeviceMemoryAllocator* allocator, - std::vector* execution_inputs, - const std::vector& compile_time_shapes) { - TF_RET_CHECK(execution_inputs->size() == compile_time_shapes.size()); - TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( - stream->parent()->platform())); - auto shape_size_fn = compiler->ShapeSizeBytesFunction(); - for (int64_t i = 0; i < compile_time_shapes.size(); i++) { - const xla::Shape& compile_time_shape = compile_time_shapes[i].shape(); - if (compile_time_shape.is_static()) { - continue; - } - xla::ExecutionInput* execution_input = &(*execution_inputs)[i]; - bool element_modified = false; - TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( - compile_time_shape, - [&](const xla::Shape& sub_shape, - const xla::ShapeIndex& index) -> Status { - if (sub_shape.IsTuple() || sub_shape.is_static()) { - return OkStatus(); - } - TF_ASSIGN_OR_RETURN( - const xla::Shape* runtime_shape, - xla::ShapeUtil::TryGetSubshape(execution_input->shape(), index)); - TF_RET_CHECK(!runtime_shape->IsTuple()); - TF_RET_CHECK(xla::ShapeUtil::DynamicArrayShapeIsCompatible( - *runtime_shape, sub_shape)); - TF_ASSIGN_OR_RETURN( - se::OwningDeviceMemory dynamic_input, - allocator->Allocate(stream->parent()->device_ordinal(), - shape_size_fn(sub_shape))); - - se::DeviceMemoryBase static_input = - execution_input->Buffer(index).AsDeviceMemoryBase(); - se::DeviceMemory* dynamic_input_base = dynamic_input.ptr(); - // Send the original data to the new location. 
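// Worked example added for illustration (assuming, as PrepareMetadata() above
// suggests, that the compiler's size function reserves one int32 of metadata
// per dimension of a dynamic shape): for a bounded shape f32[<=4,8] whose
// runtime shape is f32[2,8],
//   payload                        = 2*8*4 bytes =  64
//   payload + padding (static)     = 4*8*4 bytes = 128
//   metadata (dims {2,8} as S32)   = 2*4   bytes =   8
// so the reallocated input is 136 bytes and UpdateMetadata() writes the two
// dimension sizes at offset 128.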
- stream->ThenMemcpyD2D(dynamic_input_base, static_input, - static_input.size()); - TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, - sub_shape, *runtime_shape)); - // Modify the memory location in the input shape tree to point to the - // new input. - execution_input->SetBuffer( - index, xla::MaybeOwningDeviceMemory(std::move(dynamic_input))); - execution_input->ClearUnownedIndex(index); - element_modified = true; - return OkStatus(); - })); - if (element_modified) { - TF_RETURN_IF_ERROR(execution_input->SetDynamicShape(compile_time_shape)); - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - execution_input->ToShapedBuffer( - allocator, stream->parent()->device_ordinal())); - // The input location has been modified, need to fix tuple table to - // point to the correct address. - TF_ASSIGN_OR_RETURN( - auto transfer_manager, - xla::TransferManager::GetForPlatform(stream->parent()->platform())); - TF_RETURN_IF_ERROR( - transfer_manager->WriteTupleIndexTablesAsync(stream, shaped_buffer)); - } - } - return OkStatus(); -} - -xla::StatusOr> CreateOutputTuple( - se::Stream* stream, xla::ExecutionOutput run_result, xla::Backend* backend, - int device_ordinal, se::DeviceMemoryAllocator* allocator) { - XRTTupleAllocation* output_tuple; - xla::ScopedShapedBuffer* shaped_buffer = run_result.MutableResult(); - if (shaped_buffer->on_device_shape().is_dynamic()) { - // Update dynamic shapes from output buffer, and create a XRT tensor with - // dimension sizes read from metadata. - xla::Shape output_device_shape = shaped_buffer->on_device_shape(); - TF_ASSIGN_OR_RETURN( - auto transfer_manager, - xla::TransferManager::GetForPlatform(stream->parent()->platform())); - TF_RETURN_IF_ERROR(transfer_manager->ReadDynamicShapes( - stream, shaped_buffer, &output_device_shape)); - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - *shaped_buffer, - xla::ShapeUtil::DeviceShapeToHostShape(output_device_shape), - output_device_shape, backend, device_ordinal, &output_tuple, - allocator)); - } else { - // Fast-path: Don't copy shapes of output buffer. - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - *shaped_buffer, backend, device_ordinal, &output_tuple, allocator)); - } - // After the output tuple is created, we can release the output result - // buffers, to make sure they won't be cleared by its destructor. 
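// Clarifying note added here (consistent with the comment above, not new
// behavior): CreateFromBuffer() has taken its own references on the device
// buffers, so the ScopedShapedBuffer inside run_result is released without
// freeing; letting its destructor run would deallocate memory that the
// freshly created XRTTupleAllocation still points at.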
- (void)run_result.ConsumeResult().release(); - return RefPtr(output_tuple); -} - -xla::StatusOr> RunExecutable( - OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, - absl::Span> input_tuples, - bool release_inputs, se::Stream* stream, int rng_seed, - const xrt::CommonExecutionConfig& config) { - const xla::ComputationLayout& computation_layout = - executable->executable()->module_config().entry_computation_layout(); - std::vector input_is_dynamic = GetDynamicInputInfo(computation_layout); - TF_ASSIGN_OR_RETURN( - std::vector execution_inputs, - GetArgumentsBuffers( - executable->executable()->module().input_output_alias_config(), - input_tuples, input_is_dynamic, release_inputs)); - - se::DeviceMemoryAllocator* allocator = device_ref->allocator(); - xla::ExecutableRunOptions run_options; - run_options.set_stream(stream); - run_options.set_allocator(allocator); - run_options.set_intra_op_thread_pool(&context->eigen_cpu_device()); - run_options.set_rng_seed(rng_seed); - if (config.run_id() != 0) { - run_options.set_run_id(xla::RunId(config.run_id())); - } - if (executable->executable() - ->module_config() - .has_static_device_assignment()) { - run_options.set_device_assignment( - &executable->executable()->module_config().static_device_assignment()); - } - xla::gpu::GpuExecutableRunOptions gpu_options; - std::map gpu_global_ids; - if (config.local_replica_mapping_size() > 0) { - int i = 0; - for (auto& gid : config.local_replica_mapping()) { - gpu_global_ids[i++] = xla::GlobalDeviceId(gid); - } - gpu_options.set_gpu_global_device_ids(gpu_global_ids); - } - std::shared_ptr nccl_factory = GetNcclUniqueIdFactory(); - if (nccl_factory != nullptr) { - auto uid_callback = - [&](const xla::gpu::NcclCliqueKey& key) -> xla::StatusOr { - std::vector replicas; - const auto key_devices = key.devices(); - replicas.reserve(key_devices.size()); - for (auto& device : key_devices) { - replicas.push_back(device.value()); - } - return nccl_factory->GetUniqueId(replicas); - }; - gpu_options.set_nccl_unique_id_callback(uid_callback); - } - run_options.set_gpu_executable_run_options(&gpu_options); - - const std::vector& shape_layouts = - executable->executable() - ->module_config() - .entry_computation_layout() - .parameter_layouts(); - TF_RETURN_IF_ERROR(UpdateDynamicInputs(stream, run_options.allocator(), - &execution_inputs, shape_layouts)); - TF_ASSIGN_OR_RETURN( - xla::ExecutionOutput run_result, - executable->Run(std::move(execution_inputs), run_options)); - - TF_ASSIGN_OR_RETURN( - RefPtr output_tuple_ptr, - CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), - device_ref->device_ordinal(), allocator)); - // The ScopedShapedBuffer returned by the executable Run() API, in case of - // input/output buffer aliasing, might have holes in it, which need to be - // filled using the proper input tuples buffers which are the source of - // aliasing. 
- TF_RETURN_IF_ERROR(RebuildOutputAliases( - output_tuple_ptr, input_tuples, - executable->executable()->module().input_output_alias_config())); - - return std::move(output_tuple_ptr); -} - -xla::StatusOr> ExecuteComputation( - OpKernelContext* context, XRTMemoryManager* memory_manager, - XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, - absl::Span> input_tuples, - bool release_inputs, se::Stream* stream, int rng_seed, - const xrt::CommonExecutionConfig& config) { - auto runfn = [&]() { - return RunExecutable(context, device_ref, executable, input_tuples, - release_inputs, stream, rng_seed, config); - }; - - // We pass zero as requested_free_size as there is no simple way to get the - // peak heap size. Upon zero, the Run() API will try to free chunks of device - // memory, until either the runfn can run, or we run out of freeable memory. - return memory_manager->Run>( - runfn, device_ref->backend(), device_ref->device_ordinal(), - /*requested_free_size=*/0, device_ref->allocator()); -} - -xla::StatusOr> ExecuteComputation( - OpKernelContext* context, const RefPtr& memory_manager, - XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, - const std::vector& input_coords, bool release_inputs, - se::Stream* stream, int rng_seed, - const xrt::CommonExecutionConfig& config) { - XRTMemoryManager::WorkingSet working_set(memory_manager); - TF_ASSIGN_OR_RETURN( - std::vector> input_tuples, - GetInputTuples(executable, &working_set, device_ref->backend(), - input_coords, release_inputs, device_ref->allocator())); - return ExecuteComputation(context, memory_manager.get(), device_ref, - executable, input_tuples, release_inputs, stream, - rng_seed, config); -} - -// XRTExecuteOp - -class XRTExecuteOp : public AsyncOpKernel { - public: - explicit XRTExecuteOp(OpKernelConstruction* context); - ~XRTExecuteOp() override; - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override; - - private: - Status DoWork(OpKernelContext* context); -}; - -XRTExecuteOp::XRTExecuteOp(OpKernelConstruction* context) - : AsyncOpKernel(context) {} - -void XRTExecuteOp::ComputeAsync(OpKernelContext* context, DoneCallback done) { - // Schedule onto the default queue, for unbounded concurrency. 
See b/73520706 - Env::Default()->SchedClosure([this, context, done]() { - OP_REQUIRES_OK_ASYNC(context, DoWork(context), done); - done(); - }); -} - -Status XRTExecuteOp::DoWork(OpKernelContext* context) { - VLOG(1) << "XRTExecuteOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteCell()); - ResourceMgr* rm; - TF_RETURN_IF_ERROR( - XRTGenericDeviceAccessor::GetResourceManager(context, &rm)); - - const Tensor& execution_input = context->input(0); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_input.shape())); - int64_t compilation_handle = execution_input.scalar()(); - - const Tensor& execution_config = context->input(1); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape())); - xrt::XRTExecutionConfig config_proto; - TF_RET_CHECK( - ParseFromTString(execution_config.scalar()(), &config_proto)); - - int core_index_in_replica = config_proto.core_index_in_replica(); - TF_RET_CHECK(core_index_in_replica == 0); - bool release_inputs = config_proto.release_input_handles(); - bool release_compilation = config_proto.release_compilation_handle(); - - TF_ASSIGN_OR_RETURN(auto cache, - XRTGenericDeviceAccessor::GetOrCreateCompilationCache( - context, /*max_number_of_entries=*/0)); - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. - class XRTGenericDeviceAccessor::ScopedRef device_ref; - TF_RETURN_IF_ERROR( - XRTGenericDeviceAccessor::InitScopedRef(context, &device_ref)); - - int rng_seed = config_proto.rng_seed(); - if (rng_seed == 0) { - rng_seed = GetXLARandomSeed(); - } - - se::Stream* stream = context->op_device_context() - ? context->op_device_context()->stream() - : nullptr; - RefPtr memory_manager = XRTMemoryManager::Get(rm); - TF_ASSIGN_OR_RETURN(std::vector input_coords, - GetComputationInputs(context, "input_handles")); - - std::unique_ptr entry; - TF_RETURN_IF_ERROR(cache->Lookup(compilation_handle, &entry)); - xla::LocalExecutable* executable = entry->get().get_executable(); - if (release_compilation) { - // Process-wide cache of XLA executables. - TF_RETURN_IF_ERROR(cache->Release(compilation_handle)); - VLOG(2) << "Released compilation handle " << compilation_handle; - } - - TF_ASSIGN_OR_RETURN( - RefPtr output_tuple, - ExecuteComputation(context, memory_manager, &device_ref, executable, - input_coords, release_inputs, stream, rng_seed, - config_proto.common_config())); - - return CreateExecuteOutput(context, memory_manager.get(), - std::move(output_tuple), - config_proto.return_exploded_tuple()); -} - -XRTExecuteOp::~XRTExecuteOp() = default; - -class XRTExecuteChainedOp : public AsyncOpKernel { - public: - explicit XRTExecuteChainedOp(OpKernelConstruction* context); - ~XRTExecuteChainedOp() override; - - void ComputeAsync(OpKernelContext* context, DoneCallback done) override; - - private: - Status DoWork(OpKernelContext* context); -}; - -XRTExecuteChainedOp::XRTExecuteChainedOp(OpKernelConstruction* context) - : AsyncOpKernel(context) {} - -void XRTExecuteChainedOp::ComputeAsync(OpKernelContext* context, - DoneCallback done) { - // Schedule onto the default queue, for unbounded concurrency. 
See b/73520706 - Env::Default()->SchedClosure([this, context, done]() { - OP_REQUIRES_OK_ASYNC(context, DoWork(context), done); - done(); - }); -} - -Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { - VLOG(1) << "XRTExecuteChainedOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetExecuteChainedCell()); - ResourceMgr* rm; - TF_RETURN_IF_ERROR( - XRTGenericDeviceAccessor::GetResourceManager(context, &rm)); - - const Tensor& execution_plan = context->input(0); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_plan.shape())); - xrt::XRTChainedExecutePlan plan; - TF_RET_CHECK(ParseFromTString(execution_plan.scalar()(), &plan)); - - const Tensor& execution_config = context->input(1); - TF_RET_CHECK(TensorShapeUtils::IsScalar(execution_config.shape())); - xrt::XRTChainedExecuteConfig config; - TF_RET_CHECK(ParseFromTString(execution_config.scalar()(), &config)); - - TF_ASSIGN_OR_RETURN(auto cache, - XRTGenericDeviceAccessor::GetOrCreateCompilationCache( - context, /*max_number_of_entries=*/0)); - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. - class XRTGenericDeviceAccessor::ScopedRef device_ref; - TF_RETURN_IF_ERROR( - XRTGenericDeviceAccessor::InitScopedRef(context, &device_ref)); - - int rng_seed = config.rng_seed(); - if (rng_seed == 0) { - rng_seed = GetXLARandomSeed(); - } - - se::Stream* stream = context->op_device_context() - ? context->op_device_context()->stream() - : nullptr; - RefPtr memory_manager = XRTMemoryManager::Get(rm); - auto execute_op = [&](const xrt::XRTChainedExecuteOp& op, - absl::Span> op_inputs) - -> xla::StatusOr> { - std::unique_ptr entry; - TF_RETURN_IF_ERROR(cache->Lookup(op.computation_handle(), &entry)); - xla::LocalExecutable* executable = entry->get().get_executable(); - - TF_ASSIGN_OR_RETURN(std::vector> input_tuples, - GetChainedOpInputTuples(op, op_inputs)); - - return ExecuteComputation( - context, memory_manager.get(), &device_ref, executable, input_tuples, - /*release_inputs=*/false, stream, rng_seed, config.common_config()); - }; - - return ExecuteChained(context, memory_manager, device_ref.backend(), - device_ref.device_ordinal(), plan, config, execute_op, - device_ref.allocator()); -} - -XRTExecuteChainedOp::~XRTExecuteChainedOp() = default; - -} // namespace - -REGISTER_KERNEL_BUILDER(Name("XRTExecute") - .Device(DEVICE_XLA_CPU) - .HostMemory("computation_handle") - .HostMemory("execution_config") - .HostMemory("input_handles") - .HostMemory("output_handle"), - XRTExecuteOp); - -REGISTER_KERNEL_BUILDER(Name("XRTExecute") - .Device(DEVICE_XLA_GPU) - .HostMemory("computation_handle") - .HostMemory("execution_config") - .HostMemory("input_handles") - .HostMemory("output_handle"), - XRTExecuteOp); - -REGISTER_KERNEL_BUILDER(Name("XRTExecuteChained") - .Device(DEVICE_XLA_CPU) - .HostMemory("execution_plan") - .HostMemory("execution_config") - .HostMemory("output_handle"), - XRTExecuteChainedOp); - -REGISTER_KERNEL_BUILDER(Name("XRTExecuteChained") - .Device(DEVICE_XLA_GPU) - .HostMemory("execution_plan") - .HostMemory("execution_config") - .HostMemory("output_handle"), - XRTExecuteChainedOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc deleted file mode 100644 index 09ca1ef948aaf1..00000000000000 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for allocating XLA literals in device memory and managing handles -// that refer to them. - -#include "tensorflow/compiler/xrt/kernels/xrt_state_ops.h" - -#include -#include -#include - -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "xla/client/local_client.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" - -namespace tensorflow { -namespace { - -class XRTMetricsCollectOp : public OpKernel { - public: - explicit XRTMetricsCollectOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTMetricsCollectOp::Compute"; - - const Tensor& metrics_proto = ctx->input(0); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(metrics_proto.shape()), - errors::Internal("request input should be a string scalar")); - xrt::XRTMetricsCollect metrics; - OP_REQUIRES(ctx, - ParseFromTString(metrics_proto.scalar()(), &metrics), - errors::InvalidArgument( - "Unable to parse request input to XRTMetricsCollect")); - - xla::StatusOr collected_metrics_or = - CollectMetrics(metrics); - OP_REQUIRES_OK(ctx, collected_metrics_or.status()); - xrt::MetricsReport collected_metrics = - std::move(collected_metrics_or).value(); - Tensor output(DT_STRING, TensorShape({})); - output.scalar()() = collected_metrics.SerializeAsString(); - ctx->set_output(0, output); - } -}; - -} // namespace - -REGISTER_KERNEL_BUILDER(Name("XRTAllocate") - .Device(DEVICE_XLA_GPU) - .HostMemory("allocation") - .HostMemory("handle"), - XRTAllocateOp); -REGISTER_KERNEL_BUILDER(Name("XRTAllocate") - .Device(DEVICE_XLA_CPU) - .HostMemory("allocation") - .HostMemory("handle"), - XRTAllocateOp); - -REGISTER_KERNEL_BUILDER(Name("XRTAllocateUninitialized") - .Device(DEVICE_XLA_GPU) - .HostMemory("handle"), - XRTAllocateUninitializedOp); -REGISTER_KERNEL_BUILDER(Name("XRTAllocateUninitialized") - .Device(DEVICE_XLA_CPU) - .HostMemory("handle"), - XRTAllocateUninitializedOp); - -REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor") - .Device(DEVICE_XLA_GPU) - .HostMemory("inputs") - .HostMemory("handle"), - XRTAllocateFromTensorOp); -REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor") - .Device(DEVICE_XLA_CPU) - .HostMemory("inputs") - .HostMemory("handle"), - XRTAllocateFromTensorOp); - -REGISTER_KERNEL_BUILDER(Name("XRTSubTuple") - .Device(DEVICE_XLA_GPU) - .HostMemory("base_handle") - .HostMemory("shape_index") - .HostMemory("output_handle"), - XRTSubTupleOp); -REGISTER_KERNEL_BUILDER(Name("XRTSubTuple") - .Device(DEVICE_XLA_CPU) - .HostMemory("base_handle") - .HostMemory("shape_index") - .HostMemory("output_handle"), - XRTSubTupleOp); - -REGISTER_KERNEL_BUILDER(Name("XRTSubTupleAndRelease") - .Device(DEVICE_XLA_GPU) - .HostMemory("base_handle") - .HostMemory("shape_index") - .HostMemory("output_handle"), - XRTSubTupleOp); -REGISTER_KERNEL_BUILDER(Name("XRTSubTupleAndRelease") - .Device(DEVICE_XLA_CPU) - .HostMemory("base_handle") - .HostMemory("shape_index") - 
.HostMemory("output_handle"), - XRTSubTupleOp); - -REGISTER_KERNEL_BUILDER(Name("XRTMakeTuple") - .Device(DEVICE_XLA_GPU) - .HostMemory("tuple_description") - .HostMemory("input_handles") - .HostMemory("output_handle"), - XRTMakeTupleOp); -REGISTER_KERNEL_BUILDER(Name("XRTMakeTuple") - .Device(DEVICE_XLA_CPU) - .HostMemory("tuple_description") - .HostMemory("input_handles") - .HostMemory("output_handle"), - XRTMakeTupleOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral") - .Device(DEVICE_XLA_GPU) - .HostMemory("handle") - .HostMemory("literal"), - XRTReadLiteralOp); -REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral") - .Device(DEVICE_XLA_CPU) - .HostMemory("handle") - .HostMemory("literal"), - XRTReadLiteralOp); - -REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral") - .Device(DEVICE_XLA_GPU) - .HostMemory("handle") - .HostMemory("literal") - .HostMemory("output_handle"), - XRTWriteLiteralOp); -REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral") - .Device(DEVICE_XLA_CPU) - .HostMemory("handle") - .HostMemory("literal") - .HostMemory("output_handle"), - XRTWriteLiteralOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease") - .Device(DEVICE_XLA_GPU) - .HostMemory("handle") - .HostMemory("literal"), - XRTReadLiteralOp); -REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease") - .Device(DEVICE_XLA_CPU) - .HostMemory("handle") - .HostMemory("literal"), - XRTReadLiteralOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor") - .Device(DEVICE_XLA_GPU) - .HostMemory("handles") - .HostMemory("tensors"), - XRTReadToTensorOp); -REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor") - .Device(DEVICE_XLA_CPU) - .HostMemory("handles") - .HostMemory("tensors"), - XRTReadToTensorOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle") - .Device(DEVICE_XLA_GPU) - .HostMemory("handle"), - XRTReleaseAllocationOp); -REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle") - .Device(DEVICE_XLA_CPU) - .HostMemory("handle"), - XRTReleaseAllocationOp); - -REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_GPU), - XRTReleaseAllAllocationsOp); -REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_CPU), - XRTReleaseAllAllocationsOp); - -REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_GPU), - XRTCompactAllocationsOp); -REGISTER_KERNEL_BUILDER(Name("XRTCompactAllocations").Device(DEVICE_XLA_CPU), - XRTCompactAllocationsOp); - -REGISTER_KERNEL_BUILDER(Name("XRTMetricsCollect").Device(DEVICE_CPU), - XRTMetricsCollectOp); - -REGISTER_KERNEL_BUILDER(Name("XRTMemoryInfo").Device(DEVICE_XLA_GPU), - XRTMemoryInfoOp); -REGISTER_KERNEL_BUILDER(Name("XRTMemoryInfo").Device(DEVICE_XLA_CPU), - XRTMemoryInfoOp); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h deleted file mode 100644 index 5faf034af023d9..00000000000000 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h +++ /dev/null @@ -1,784 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for allocating XLA literals in device memory and managing handles -// that refer to them. - -#ifndef TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_ -#define TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_ - -#include -#include -#include - -#include "tensorflow/compiler/tf2xla/literal_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/tf2xla/type_util.h" -#include "xla/client/local_client.h" -#include "xla/layout_util.h" -#include "xla/literal.h" -#include "xla/status_macros.h" -#include "xla/statusor.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/compiler/xrt/xrt_device.h" -#include "tensorflow/compiler/xrt/xrt_memory_manager.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" -#include "tensorflow/compiler/xrt/xrt_state.h" -#include "tensorflow/core/common_runtime/dma_helper.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/monitoring/percentile_sampler.h" -#include "tensorflow/core/lib/monitoring/timed.h" -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { - -// Helper functions for templated ops. -class XRTStateHelpers { - public: - // The Status return value allows us to use the - // TF_ASSIGN_OR_RETURN macro, which doesn't work within the body of an - // OpKernel::Compute method. - static Status MakeLiteral(const xla::LiteralProto& proto, - xla::Literal* literal) { - TF_ASSIGN_OR_RETURN(*literal, xla::Literal::CreateFromProto(proto)); - return OkStatus(); - } - - // ParseTupleNode is the recursive function used to parse a recursive - // xrt::XLATupleNode proto and generate the xla::Shape of the 'spine' i.e. the - // tuple shape where every leaf is an existing allocation. As a side-effect it - // fills in input_vector by looking up allocations from handles in the - // input_tensor_list as they are referenced by nodes in the proto. - static Status ParseTupleNode( - const xrt::XLATupleNode& tuple_node, const OpInputList& input_tensor_list, - std::vector* input_vector, - xla::Shape* shape, ResourceMgr* rm) { - if (tuple_node.tuples_size() > 0) { - // This is an internal node in the proto so descend recursively. - xla::Shape dummy = xla::ShapeUtil::MakeShapeWithType({}); - std::vector subshapes(tuple_node.tuples_size(), dummy); - *xla::ShapeUtil::GetMutableSubshape(shape, {}) = - xla::ShapeUtil::MakeTupleShape(subshapes); - for (int i = 0; i < tuple_node.tuples_size(); ++i) { - TF_RETURN_IF_ERROR(ParseTupleNode( - tuple_node.tuples(i), input_tensor_list, input_vector, - xla::ShapeUtil::GetMutableSubshape(shape, {i}), rm)); - } - } else { - // This is a leaf node in the proto so look up the referenced input. 
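// Illustrative example added here (schematic textproto, not from the original
// source): a request assembling the tuple ((h0, h1), h2) from three input
// handles would arrive as a root node
//   tuples { tuples { input_index: 0 } tuples { input_index: 1 } }
//   tuples { input_index: 2 }
// and only the leaves, i.e. the nodes without nested `tuples`, carry an
// input_index that gets dereferenced in the branch below.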
- int input_index = tuple_node.input_index(); - if (input_index < 0 || input_index >= input_vector->size()) { - return errors::InvalidArgument("Invalid tuple input index ", - input_index, ": MakeTuple has ", - input_vector->size(), " inputs."); - } - bool release_this_input = tuple_node.release_input_handle(); - XRTTupleAllocation::ExpandedTupleInput& input = - input_vector->at(input_index); - if (input.allocation != nullptr && - (input.release_allocation_after_use || release_this_input)) { - return errors::InvalidArgument( - "Invalid tuple tree: input index ", input_index, - " is repeated but release_input_handle is true."); - } - if (input.allocation == nullptr) { - // We haven't dereferenced this handle yet. - TF_RET_CHECK( - TensorShapeUtils::IsScalar(input_tensor_list[input_index].shape())); - int64_t key = input_tensor_list[input_index].scalar()(); - TF_ASSIGN_OR_RETURN(input.allocation, - XRTMemoryManager::Get(rm)->Lookup(key)); - input.release_allocation_after_use = release_this_input; - } - } - return OkStatus(); - } - - // Parses a xrt::XLATupleNode proto recursively and returns the corresponding - // ShapeTree where each leaf is an allocation corresponding to a handle in - // input_tensor_list. The ordinal of one of the allocations is returned in - // device_ordinal. Since it's not possible to specify a xrt::XLATupleNode with - // no leaves, device_ordinal will always be filled in by a successful call to - // ParseTupleTree. - static Status ParseTupleTree( - const xrt::XLATupleNode& tuple_tree_root, - const OpInputList& input_tensor_list, - std::vector* input_vector, - xla::ShapeTree* tuple_shape_tree, - int* device_ordinal, ResourceMgr* rm) { - // First get the shape of the 'spine' of the new tuple, where every leaf is - // an existing allocation. As a side-effect dereference the input handles - // into allocations in input_vector. - xla::Shape tuple_tree_shape; - TF_RETURN_IF_ERROR(ParseTupleNode(tuple_tree_root, input_tensor_list, - input_vector, &tuple_tree_shape, rm)); - // Make the shape tree of allocations where the shape is the spine and each - // leaf is one of the allocations looked up in input_vector. Internal nodes - // have nullptr allocations. - *tuple_shape_tree = xla::ShapeTree( - tuple_tree_shape); - tuple_shape_tree->ForEachMutableElement( - [&](const xla::ShapeIndex& index, - XRTTupleAllocation::ExpandedTupleInput* element) { - if (tuple_shape_tree->IsLeaf(index)) { - // Find the matching leaf in the proto tree. - const xrt::XLATupleNode* tuple_node = &tuple_tree_root; - for (int i = 0; i < index.size(); ++i) { - tuple_node = &tuple_node->tuples(index[i]); - } - // Copy the appropriate input allocation to the leaf of the - // tuple_shape_tree. - int input_index = tuple_node->input_index(); - *element = input_vector->at(input_index); - CHECK(element->release_allocation_after_use == - tuple_node->release_input_handle()); - // We just need to know the device_ordinal of one of the - // allocations. We will validate later that they are all the same. - *device_ordinal = (*element).allocation->device_ordinal(); - } - }); - return OkStatus(); - } -}; - -// Op that allocates memory for a literal and transfers it to the device. 
-template -class XRTAllocateOp : public OpKernel { - public: - explicit XRTAllocateOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTAllocateOp() override = default; - XRTAllocateOp(const XRTAllocateOp&) = delete; - XRTAllocateOp& operator=(const XRTAllocateOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTAllocateOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetAllocateCell()); - - const Tensor& allocation_info = ctx->input(0); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(allocation_info.shape()), - errors::Internal("allocation input should be a string scalar")); - xrt::XLAAllocation allocation_proto; - OP_REQUIRES(ctx, - ParseFromTString(allocation_info.scalar()(), - &allocation_proto), - errors::InvalidArgument( - "Unable to parse allocation input to XLAAllocation")); - - xla::Literal literal; - OP_REQUIRES_OK( - ctx, XRTStateHelpers::MakeLiteral(allocation_proto.value(), &literal)); - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. - class DeviceAccessor::ScopedRef device_ref; - OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer( - literal, memory_manager.get(), device_ref.backend(), - device_ref.device_ordinal(), &allocation, - device_ref.allocator())); - - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = memory_manager->Register(allocation); - ctx->set_output(0, output); - } -}; - -// Op that allocates uninitialized memory on the device for a tensor of -// a particular shape. -template -class XRTAllocateUninitializedOp : public OpKernel { - public: - explicit XRTAllocateUninitializedOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &tf_shape_)); - OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, tf_shape_, &xla_shape_)); - } - ~XRTAllocateUninitializedOp() override = default; - XRTAllocateUninitializedOp(const XRTAllocateUninitializedOp&) = delete; - XRTAllocateUninitializedOp& operator=(const XRTAllocateUninitializedOp&) = - delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTAllocateUninitializedOp::Compute"; - auto timed = - monitoring::MakeTimed(xrt_metrics::GetAllocateUninitializedCell()); - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. - class DeviceAccessor::ScopedRef device_ref; - OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateUninitialized( - xla_shape_, memory_manager.get(), - device_ref.backend(), device_ref.device_ordinal(), - &allocation, device_ref.allocator())); - - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = memory_manager->Register(allocation); - ctx->set_output(0, output); - } - - private: - DataType dtype_; - TensorShape tf_shape_; - xla::Shape xla_shape_; -}; - -// Op that allocates memory for a tensor (with optional layout) and transfers it -// to the device, returning an allocation handle. 
-template -class XRTAllocateFromTensorOp : public OpKernel { - public: - explicit XRTAllocateFromTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - bool make_tuple = false; - OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &tf_shapes_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("make_tuple", &make_tuple)); - std::vector minor_to_major; - if (ctx->HasAttr("layouts")) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major)); - } - OP_REQUIRES( - ctx, tf_shapes_.size() == dtypes_.size(), - errors::InvalidArgument("shapes and dtypes must be the same length")); - std::vector xla_shapes; - xla_shapes.reserve(tf_shapes_.size()); - for (int i = 0; i < tf_shapes_.size(); i++) { - xla::Shape xla_shape; - OP_REQUIRES_OK( - ctx, TensorShapeToXLAShape(dtypes_[i], tf_shapes_[i], &xla_shape)); - xla_shapes.push_back(std::move(xla_shape)); - } - if (xla_shapes.size() > 1 || make_tuple) { - shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes); - } else { - shape_.Swap(&xla_shapes.front()); - } - if (!minor_to_major.empty()) { - xla::Shape shape_with_layouts; - OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major, - /*layout_func=*/nullptr, - &shape_with_layouts)); - shape_.Swap(&shape_with_layouts); - } - } - - ~XRTAllocateFromTensorOp() override = default; - XRTAllocateFromTensorOp(const XRTAllocateFromTensorOp&) = delete; - XRTAllocateFromTensorOp& operator=(const XRTAllocateFromTensorOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTAllocateFromTensorOp::Compute"; - auto timed = - monitoring::MakeTimed(xrt_metrics::GetAllocateFromTensorCell()); - - OpInputList values; - OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &values)); - OP_REQUIRES(ctx, values.size() == tf_shapes_.size(), - errors::InvalidArgument( - "Wrong number of inputs to XRTAllocateFromTensor: ", - values.size(), " vs. ", tf_shapes_.size())); - - std::vector tensors_data; - for (size_t i = 0; i < values.size(); ++i) { - const Tensor& input_tensor = values[i]; - OP_REQUIRES(ctx, input_tensor.dtype() == dtypes_[i], - errors::InvalidArgument( - "Input tensor type and input dtype do not match")); - // We allow the requested on-device shape to differ from the shape of the - // input tensor, as long as they have the same number of elements. - OP_REQUIRES( - ctx, - input_tensor.shape().num_elements() == tf_shapes_[i].num_elements(), - errors::InvalidArgument( - "Input tensor must have the number of elements specified " - "in the matching input shape: ", - input_tensor.shape().num_elements(), " vs. ", - tf_shapes_[i].num_elements(), " at index ", i)); - tensors_data.push_back( - static_cast(DMAHelper::base(&input_tensor))); - } - // Use the buffer straight out of the input tensors to create the literal. - xla::BorrowingLiteral literal = - shape_.IsTuple() ? xla::BorrowingLiteral(tensors_data, shape_) - : xla::BorrowingLiteral(tensors_data.front(), shape_); - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. 
- class DeviceAccessor::ScopedRef device_ref; - OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer( - literal, memory_manager.get(), device_ref.backend(), - device_ref.device_ordinal(), &allocation, - device_ref.allocator())); - - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = memory_manager->Register(allocation); - ctx->set_output(0, output); - } - - private: - std::vector tf_shapes_; - DataTypeVector dtypes_; - xla::Shape shape_; -}; - -// Op that takes a tuple handle input and returns a handle to a sub-tuple of the -// input. -template -class XRTSubTupleOp : public OpKernel { - public: - explicit XRTSubTupleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTSubTupleOp() override = default; - XRTSubTupleOp(const XRTSubTupleOp&) = delete; - XRTSubTupleOp& operator=(const XRTSubTupleOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTSubTupleOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetSubTupleCell()); - - const Tensor& handle_tensor = ctx->input(0); - OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()), - errors::Internal("computation input should be an int64 scalar")); - int64_t allocation_handle = handle_tensor.scalar()(); - - const Tensor& subtuple_info = ctx->input(1); - OP_REQUIRES( - ctx, TensorShapeUtils::IsVector(subtuple_info.shape()), - errors::Internal("tuple index input should be an int32 vector")); - xla::ShapeIndex shape_index; - for (int i = 0; i < subtuple_info.dim_size(0); ++i) { - shape_index.push_back(subtuple_info.vec()(i)); - } - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - RefPtr allocation; - OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation)); - - if (discard_) { - VLOG(2) << "Releasing handle " << allocation_handle; - OP_REQUIRES_OK(ctx, memory_manager->Release(allocation_handle)); - } - - XRTTupleAllocation* suballocation; - OP_REQUIRES_OK( - ctx, XRTTupleAllocation::MakeSubBuffer(allocation.get(), shape_index, - &suballocation, !discard_)); - - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = memory_manager->Register(suballocation); - ctx->set_output(0, output); - } -}; - -// Op that allocates memory for a literal and transfers it to the device. -template -class XRTMakeTupleOp : public OpKernel { - public: - explicit XRTMakeTupleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTMakeTupleOp() override = default; - XRTMakeTupleOp(const XRTMakeTupleOp&) = delete; - XRTMakeTupleOp& operator=(const XRTMakeTupleOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTMakeTupleOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetMakeTupleCell()); - - const Tensor& tuple_info = ctx->input(0); - OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(tuple_info.shape()), - errors::Internal("tuple description input should be a string scalar")); - xrt::XLATupleNode tuple_proto; - OP_REQUIRES( - ctx, ParseFromTString(tuple_info.scalar()(), &tuple_proto), - errors::InvalidArgument("Unable to parse tuple input to XLATupleNode")); - - OpInputList arg_list; - OP_REQUIRES_OK(ctx, ctx->input_list("input_handles", &arg_list)); - - // For each input, the allocation it corresponds to and a flag indicating - // whether or not it should be released, i.e. 
discarded from the resource
- // manager. One ref on each allocation is owned by this vector, and freed on
- // exit.
- std::vector<XRTTupleAllocation::ExpandedTupleInput> input_vector(
- arg_list.size());
- ResourceMgr* rm;
- OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
-
- xla::ShapeTree<XRTTupleAllocation::ExpandedTupleInput> tuple_shape_tree;
- // device_ordinal is filled in by ParseTupleTree with the ordinal of one of
- // the allocations. It is guaranteed that there is at least one allocation in
- // any legal tree. We validate below in XRTTupleAllocation::MakeTuple that
- // all the allocations are on the same device.
- int device_ordinal;
- OP_REQUIRES_OK(ctx, XRTStateHelpers::ParseTupleTree(
- tuple_proto, arg_list, &input_vector,
- &tuple_shape_tree, &device_ordinal, rm));
-
- // We are guaranteed that the underlying device object won't be deleted out
- // from under us, while the ScopedRef is live.
- class DeviceAccessor::ScopedRef device_ref;
- OP_REQUIRES_OK(
- ctx, DeviceAccessor::InitScopedRef(ctx, device_ordinal, &device_ref));
-
- RefPtr<XRTMemoryManager> memory_manager = XRTMemoryManager::Get(rm);
- XRTTupleAllocation* output_allocation;
- OP_REQUIRES_OK(ctx, XRTTupleAllocation::MakeTuple(
- memory_manager.get(), device_ref.backend(),
- device_ref.device_ordinal(), tuple_shape_tree,
- &output_allocation, device_ref.allocator()));
- RefPtr<XRTTupleAllocation> output_ptr(output_allocation);
- for (int i = 0; i < input_vector.size(); ++i) {
- if (input_vector[i].release_allocation_after_use) {
- OP_REQUIRES_OK(
- ctx, memory_manager->Release(arg_list[i].scalar<int64_t>()()));
- }
- }
-
- Tensor output(DT_INT64, TensorShape({}));
- output.scalar<int64_t>()() =
- memory_manager->Register(std::move(output_ptr));
- ctx->set_output(0, output);
- }
-};
-
-// Op that reads a device-resident tuple to host memory and returns it as a
-// literal.
-template
-class XRTReadLiteralOp : public OpKernel {
- public:
- explicit XRTReadLiteralOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
- ~XRTReadLiteralOp() override = default;
- XRTReadLiteralOp(const XRTReadLiteralOp&) = delete;
- XRTReadLiteralOp& operator=(const XRTReadLiteralOp&) = delete;
-
- void Compute(OpKernelContext* ctx) override {
- VLOG(1) << "XRTReadLiteralOp::Compute";
- auto timed = monitoring::MakeTimed(xrt_metrics::GetReadLiteralCell());
-
- const Tensor& handle_tensor = ctx->input(0);
- OP_REQUIRES(
- ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
- errors::Internal("computation input should be an int64 scalar"));
- int64_t allocation_handle = handle_tensor.scalar<int64_t>()();
-
- ResourceMgr* rm;
- OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
-
- RefPtr<XRTMemoryManager> memory_manager = XRTMemoryManager::Get(rm);
- RefPtr<XRTTupleAllocation> allocation;
- OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation));
-
- if (discard_) {
- VLOG(2) << "Releasing handle " << allocation_handle;
- OP_REQUIRES_OK(ctx, memory_manager->Release(allocation_handle));
- }
-
- // We are guaranteed that the underlying device object won't be deleted out
- // from under us, while the ScopedRef is live.
- class DeviceAccessor::ScopedRef device_ref;
- OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
- ctx, allocation->device_ordinal(), &device_ref));
-
- xla::Literal literal(allocation->on_host_shape());
- OP_REQUIRES_OK(ctx, allocation->ToLiteral(device_ref.backend(), &literal));
- xla::LiteralProto literal_proto = literal.ToProto();
-
- Tensor output(DT_STRING, TensorShape({}));
- SerializeToTString(literal_proto, &output.scalar<tstring>()());
- ctx->set_output(0, output);
- }
-};
-
-// Op that reads a device-resident tuple to host memory and returns it as a
-// literal.
-template <class DeviceAccessor>
-class XRTReadToTensorOp : public OpKernel {
- public:
- explicit XRTReadToTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
- OP_REQUIRES_OK(ctx, ctx->GetAttr("release_handles", &discard_));
- OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
- }
- ~XRTReadToTensorOp() override = default;
- XRTReadToTensorOp(const XRTReadToTensorOp&) = delete;
- XRTReadToTensorOp& operator=(const XRTReadToTensorOp&) = delete;
-
- void Compute(OpKernelContext* ctx) override {
- VLOG(1) << "XRTReadToTensorOp::Compute";
- auto timed = monitoring::MakeTimed(xrt_metrics::GetReadToTensorCell());
-
- const Tensor& handle_tensor = ctx->input(0);
- // TODO(phawkins,dlibenzi): accept multiple handles (i.e., vectors, not
- // just scalars.)
- OP_REQUIRES(
- ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
- errors::Internal("computation input should be an int64 scalar"));
- int64_t allocation_handle = handle_tensor.scalar<int64_t>()();
-
- ResourceMgr* rm;
- OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
-
- RefPtr<XRTMemoryManager> memory_manager = XRTMemoryManager::Get(rm);
- RefPtr<XRTTupleAllocation> allocation;
- OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation));
-
- if (discard_) {
- VLOG(2) << "Releasing handle " << allocation_handle;
- OP_REQUIRES_OK(ctx, memory_manager->Release(allocation_handle));
- }
-
- // We are guaranteed that the underlying device object won't be deleted out
- // from under us, while the ScopedRef is live.
- class DeviceAccessor::ScopedRef device_ref; - OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef( - ctx, allocation->device_ordinal(), &device_ref)); - - xla::Shape shape = allocation->on_host_shape(); - int output = 0; - Status status = xla::ShapeUtil::ForEachMutableSubshapeWithStatus( - &shape, - [&](xla::Shape* subshape, const xla::ShapeIndex& index) -> Status { - if (subshape->IsTuple()) return OkStatus(); - - xla::PrimitiveType xla_type; - TF_RETURN_IF_ERROR(DataTypeToPrimitiveType( - ctx->expected_output_dtype(output), &xla_type)); - if (xla_type != subshape->element_type()) { - return errors::InvalidArgument( - "Type mismatch between buffer type (", subshape->ToString(), - ") and tensor type (", - DataTypeString(ctx->expected_output_dtype(output)), - ") for output tensor ", output); - } - - TensorShape output_shape; - TF_RETURN_IF_ERROR(XLAShapeToTensorShape(*subshape, &output_shape)); - - Tensor* output_tensor; - TF_RETURN_IF_ERROR( - ctx->allocate_output(output, output_shape, &output_tensor)); - - XRTTupleAllocation* sub; - TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( - allocation.get(), index, &sub, /*alias_parent_allocation=*/true)); - core::ScopedUnref sub_unref(sub); - - xla::MutableBorrowingLiteral literal; - TF_RETURN_IF_ERROR(HostTensorToMutableBorrowingLiteral( - xla::LayoutUtil::GetWithDefaultLayout(*subshape), output_tensor, - &literal)); - TF_RETURN_IF_ERROR(sub->ToLiteral(device_ref.backend(), &literal)); - - ++output; - return OkStatus(); - }); - OP_REQUIRES_OK(ctx, status); - } - bool discard_; - DataTypeVector dtypes_; -}; - -// Op that writes a new literal value into device-resident memory. -template -class XRTWriteLiteralOp : public OpKernel { - public: - explicit XRTWriteLiteralOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTWriteLiteralOp() override = default; - XRTWriteLiteralOp(const XRTWriteLiteralOp&) = delete; - XRTWriteLiteralOp& operator=(const XRTWriteLiteralOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTWriteLiteralOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetWriteLiteralCell()); - - const Tensor& handle_tensor = ctx->input(0); - OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()), - errors::Internal("computation input should be an int64 scalar")); - int64_t allocation_handle = handle_tensor.scalar()(); - - const Tensor& literal_info = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(literal_info.shape()), - errors::Internal("literal input should be a string scalar")); - xla::LiteralProto literal_proto; - OP_REQUIRES( - ctx, ParseFromTString(literal_info.scalar()(), &literal_proto), - errors::InvalidArgument( - "Unable to parse allocation input to LiteralProto")); - xla::Literal literal; - OP_REQUIRES_OK(ctx, XRTStateHelpers::MakeLiteral(literal_proto, &literal)); - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - RefPtr allocation; - OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation)); - - // We are guaranteed that the underlying device object won't be deleted out - // from under us, while the ScopedRef is live. 
- typename DeviceAccessor::ScopedRef device_ref; - OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef( - ctx, allocation->device_ordinal(), &device_ref)); - OP_REQUIRES_OK(ctx, - allocation->WriteLiteral(device_ref.backend(), literal)); - - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = allocation_handle; - ctx->set_output(0, output); - } -}; - -// Op that discards a handle to device memory. -template -class XRTReleaseAllocationOp : public OpKernel { - public: - explicit XRTReleaseAllocationOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTReleaseAllocationOp() override = default; - XRTReleaseAllocationOp(const XRTReleaseAllocationOp&) = delete; - XRTReleaseAllocationOp& operator=(const XRTReleaseAllocationOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTReleaseAllocationOp::Compute"; - auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseAllocationCell()); - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - - RefPtr memory_manager = XRTMemoryManager::Get(rm); - const Tensor& allocation_handle = ctx->input(0); - auto flat_keys = allocation_handle.flat(); - for (int64_t i = 0; i < flat_keys.size(); ++i) { - int64_t key = flat_keys(i); - OP_REQUIRES_OK(ctx, memory_manager->Release(key)); - VLOG(2) << "Released allocation handle " << key; - } - } -}; - -// Op that discards a handle to device memory. -template -class XRTReleaseAllAllocationsOp : public OpKernel { - public: - explicit XRTReleaseAllAllocationsOp(OpKernelConstruction* ctx) - : OpKernel(ctx) {} - ~XRTReleaseAllAllocationsOp() override = default; - XRTReleaseAllAllocationsOp(const XRTReleaseAllAllocationsOp&) = delete; - XRTReleaseAllAllocationsOp& operator=(const XRTReleaseAllAllocationsOp&) = - delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTReleaseAllAllocationsOp::Compute"; - auto timed = - monitoring::MakeTimed(xrt_metrics::GetReleaseAllAllocationsCell()); - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - XRTMemoryManager::Get(rm)->ReleaseAllAllocations(); - } -}; - -template -class XRTCompactAllocationsOp : public OpKernel { - public: - explicit XRTCompactAllocationsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTCompactAllocationsOp() override = default; - XRTCompactAllocationsOp(const XRTCompactAllocationsOp&) = delete; - XRTCompactAllocationsOp& operator=(const XRTCompactAllocationsOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - VLOG(1) << "XRTCompactAllocationsOp::Compute"; - auto timed = - monitoring::MakeTimed(xrt_metrics::GetCompactAllocationsCell()); - - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - RefPtr memory_manager = XRTMemoryManager::Get(rm); - class DeviceAccessor::ScopedRef device_ref; - OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); - OP_REQUIRES_OK(ctx, memory_manager->CompactAllocations( - device_ref.backend(), device_ref.device_ordinal(), - device_ref.allocator())); - } -}; - -template -class XRTMemoryInfoOp : public OpKernel { - public: - explicit XRTMemoryInfoOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~XRTMemoryInfoOp() override = default; - XRTMemoryInfoOp(const XRTMemoryInfoOp&) = delete; - XRTMemoryInfoOp& operator=(const XRTMemoryInfoOp&) = delete; - - void Compute(OpKernelContext* ctx) override { - auto kernel_fn = [&]() -> Status { - VLOG(1) << "XRTMemoryInfoOp::Compute"; - - class DeviceAccessor::ScopedRef device_ref; - 
TF_RETURN_IF_ERROR(DeviceAccessor::InitScopedRef(ctx, &device_ref)); - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * stream_executor, - device_ref.backend()->stream_executor(device_ref.device_ordinal())); - int64_t mem_free = -1; - int64_t mem_total = -1; - if (!stream_executor->DeviceMemoryUsage(&mem_free, &mem_total)) { - VLOG(2) << "Device " << ctx->device()->name() - << " does not expose memory information"; - } - xrt::MemoryInfo mem_info; - mem_info.set_kb_total((mem_total >= 0) ? mem_total / 1024 : -1); - mem_info.set_kb_free((mem_free >= 0) ? mem_free / 1024 : -1); - - Tensor output(DT_STRING, TensorShape({})); - output.scalar()() = mem_info.SerializeAsString(); - ctx->set_output(0, output); - return OkStatus(); - }; - OP_REQUIRES_OK(ctx, kernel_fn()); - } -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_ diff --git a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc deleted file mode 100644 index fffb703dd84c2d..00000000000000 --- a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { - -REGISTER_OP("XRTCompile") - .Input("computation: string") - .Output("handle: int64") - .Output("program_shape: string") - .SetShapeFn([](shape_inference::InferenceContext* c) { - c->set_output(0, c->Scalar()); - c->set_output(1, c->UnknownShapeOfRank(1)); - return OkStatus(); - }) - .Doc( - R"( -Reads a computation proto, compiles it, and places it in the global compilation -cache. - -'computation' is a serialized xrt::XLAComputation proto. -'handle' is an identifier that can be used in other ops to refer to the -computation. -)"); - -REGISTER_OP("XRTReleaseCompilationHandle") - .Input("handle: int64") - .SetShapeFn(tensorflow::shape_inference::NoOutputs) - .Doc( - R"( -Discards one or more computation handles from the compilation cache. -The handle(s) cannot be subsequently used. - -'handle' is an ID (or vector of IDs) returned from a XRTCompile Op. -)"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc deleted file mode 100644 index 6f485d82cbecc4..00000000000000 --- a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { - -REGISTER_OP("XRTExecute") - .Attr("Ninputs: int >= 0") - .Input("computation_handle: int64") - .Input("execution_config: string") - .Input("input_handles: Ninputs * int64") - .Output("output_handle: int64") - .SetShapeFn([](shape_inference::InferenceContext* c) { - std::vector input_handle_shapes; - TF_RETURN_IF_ERROR(c->input("input_handles", &input_handle_shapes)); - for (size_t i = 0; i < input_handle_shapes.size(); ++i) { - shape_inference::ShapeHandle unused; - TF_RETURN_IF_ERROR( - c->WithRankAtMost(input_handle_shapes[i], 1, &unused)); - } - return tensorflow::shape_inference::ScalarShape(c); - }) - .Doc( - R"( -Runs a previously-compiled computation on a core. If -execution_config.release_input_handles is true, the input handles are invalid -after this op runs. - -'computation_handle' is an id returned by XRTCompile. -'execution_config' is a serialized xrt::TPUExecutionConfig proto. -'input_handles' is a list of ids of allocations, one per input to the compiled -computation. -'output_handle' is an identifier for the result of the compiled computation. -'Ninputs' is the number of input handles. -)"); - -REGISTER_OP("XRTExecuteChained") - .Input("execution_plan: string") - .Input("execution_config: string") - .Output("output_handle: int64") - .SetShapeFn([](shape_inference::InferenceContext* c) { - return tensorflow::shape_inference::ScalarShape(c); - }) - .Doc( - R"( -Runs a sequence of previously-compiled computations on a core. -The 'execution_plan' input is a serialized xrt::XRTChainedExecutePlan proto -describing the post-order of the chained execution. -The 'execution_config' input is a serialized xrt::XRTChainedExecuteConfig -proto describing the configuration for the chained execution operation. -Returns one of more int64 handles to the XRT device data generated by the -chained execution. -)"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc deleted file mode 100644 index 5a831d14284633..00000000000000 --- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc +++ /dev/null @@ -1,247 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/jit/defs.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { - -static bool Initialized = [] { - tensorflow::RequestXlaDevicesCreation(); - return true; -}(); - -REGISTER_OP("XRTAllocate") - .Input("allocation: string") - .Output("handle: int64") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Reads a literal proto and transfers it to device memory. - -'allocation' is a serialized xrt::XLAAllocation proto. -'handle' is an id that can be used in other ops to refer to the allocation. -)"); - -REGISTER_OP("XRTAllocateUninitialized") - .Output("handle: int64") - .Attr("dtype: type") - .Attr("shape: shape") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Allocates a tensor to hold the specified shape in device memory. The values -in the tensor are left uninitialized. - -shape: The shapes which the tensor should have on device. - -handle: An id that can be used in other ops to refer to the allocation. -)"); - -REGISTER_OP("XRTAllocateFromTensor") - .Input("inputs: dtypes") - .Output("handle: int64") - .Attr("dtypes: list(type)") - .Attr("shapes: list(shape)") - .Attr("layouts: list(int) = []") - .Attr("make_tuple: bool = false") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Reads a list of tensors with optional layouts, and transfers it to device -memory. - -inputs: The tensors holding the input data. -shapes: The shapes which the tensors should have on device. The i-th shape -corresponds to the i-th input. The shapes, together with the (optional) -layouts, helps creating the fully qualified shape of the data on the device. -The shapes can differ from the corresponding input one, as long as the total -number of elements matches. In other words, it is possible to feed an input -tensor with shape {8} and have a corresponding shape {2,2,2}. -layouts: A vector holding the requested layout in minor-to-major sequence. -If empty, the default layout will be used. -For a tuple, the layouts vector holds a linearized minor-to-major numbers -for all the tuple leaves, in the order they appear within the tuple. -The elements within the layouts sequence corresponding to a given tuple -subshape can be set to -1, to leave such subshape to the default shape. -handle: An id that can be used in other ops to refer to the allocation. -)"); - -REGISTER_OP("XRTSubTuple") - .Input("base_handle: int64") - .Input("shape_index: int32") - .Output("output_handle: int64") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Returns a handle to a sub-tuple of an allocated tuple. - -'base_handle' is the id of the on-device allocation. -'shape_index' is a vector of integers describing an XLA ShapeIndex. -'output_handle' is an id that can be used in other ops to refer to the -sub-tuple. -)"); - -REGISTER_OP("XRTSubTupleAndRelease") - .Input("base_handle: int64") - .Input("shape_index: int32") - .Output("output_handle: int64") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Returns a handle to a sub-tuple of an allocated tuple, and releases the handle -of the input tuple. - -'base_handle' is the id of the on-device allocation. -'shape_index' is a vector of integers describing an XLA ShapeIndex. 
-'output_handle' is an id that can be used by other ops to refer to the -sub-tuple. -)"); - -REGISTER_OP("XRTMakeTuple") - .Attr("Ninputs: int") - .Input("tuple_description: string") - .Input("input_handles: Ninputs * int64") - .Output("output_handle: int64") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Returns a handle to a new allocation constructed by assembling existing -allocations in a tuple. - -'tuple_description' is a serialized xrt::XLATupleNode proto describing the -shape of the output tuple, and whether each input handle should be aliased or -released. -'input_handles' is a list of input handles to assemble into the output tuple. -'output_handle' is an id that can be used by other ops to refer to the new -tuple. -'Ninputs' is the number of input handles. -)"); - -REGISTER_OP("XRTReadLiteral") - .Input("handle: int64") - .Output("literal: string") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Copies an allocated tuple from device memory and returns it as a literal. - -'handle' is the id returned from the Op that produced the on-device allocation. -'literal' is a serialized xla::LiteralProto proto. -)"); - -REGISTER_OP("XRTWriteLiteral") - .Input("handle: int64") - .Input("literal: string") - .Output("output_handle: int64") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Copies the input literal into the device memory pointed to by handle. -Returns the handle itself. - -'handle' is the id returned from the Op that produced the on-device allocation. -'literal' is a serialized xla::LiteralProto proto to be written to device memory. -)"); - -REGISTER_OP("XRTReadLiteralAndRelease") - .Input("handle: int64") - .Output("literal: string") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Copies an allocated tuple from device memory, and returns it as a literal, and -releases the handle. - -'handle' is the id returned from the Op that produced the on-device allocation. -'literal' is a serialized xla::LiteralProto proto. -)"); - -REGISTER_OP("XRTReadToTensor") - .Input("handles: int64") - .Attr("release_handles: bool = False") - .Attr("dtypes: list(type)") - .Output("tensors: dtypes") - .SetShapeFn(tensorflow::shape_inference::UnknownShape) - .Doc( - R"( -Copies allocated values from device memory and returns them as zero or more -Tensors. If a handle refers to a non-tuple buffer, a single tensor is returned. -In general, the tensors returned for a handle correspond to an in-order traversal -of a the tuple-tree value referenced by the handle. - -'handles' contains ids returned from Ops that produced on-device allocations. -At present, only a single (scalar) handle is supported. -'dtypes' are the expected types for each `Tensor` to be returned. If the -expected and actual tensor types do not match, an error is returned. -'release_handles': if True, `handles` are released. -'tensors' are the output Tensors. -)"); - -REGISTER_OP("XRTReleaseAllocationHandle") - .Input("handle: int64") - .SetShapeFn(tensorflow::shape_inference::NoOutputs) - .Doc( - R"( -Discards one or more device memory handles. The handle(s) cannot be subsequently -used. - -'handle' is the ID (or a vector of IDs) returned from the Op that produced the -on-device allocation. -)"); - -REGISTER_OP("XRTReleaseAllAllocations") - .SetShapeFn(tensorflow::shape_inference::NoOutputs) - .Doc( - R"( -Discards all the XRT allocations. All the client held handles will be invalid. 
-)"); - -REGISTER_OP("XRTCompactAllocations") - .SetShapeFn(tensorflow::shape_inference::NoOutputs) - .Doc( - R"( -Runs a device memory compaction cycle. This copies the device data behind the -currently alive allocation handles into host memory, releases the device memory -backing the handles, and re-allocate and send back the data to the device. -This operation helps with device memory fragmentation. -)"); - -REGISTER_OP("XRTMetricsCollect") - .Input("request: string") - .Output("result: string") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Reads the selected metric values from the metrics collection registry. - -'request' is a serialized xrt::XRTMetricsCollect proto. -'result' is a serialized xrt::MetricsReport proto. -)"); - -REGISTER_OP("XRTMemoryInfo") - .Output("result: string") - .SetShapeFn(tensorflow::shape_inference::ScalarShape) - .Doc( - R"( -Returns the memory information of the device this op executes on/ - -'result' is a serialized xrt::MemoryInfo proto. -)"); - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD deleted file mode 100644 index 0139bd1fc6a076..00000000000000 --- a/tensorflow/compiler/xrt/tests/BUILD +++ /dev/null @@ -1,88 +0,0 @@ -load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_cc_test") -load( - "//tensorflow/core/platform:build_config_root.bzl", - "tf_cuda_tests_tags", -) -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = ["//learning/brain:__subpackages__"], - licenses = ["notice"], -) - -cc_library( - name = "raw_api_test_lib", - testonly = 1, - srcs = [ - "raw_api_test.cc", - ], - deps = [ - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:client_session", - "//tensorflow/cc:ops", - "//tensorflow/cc:scope", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xrt:xrt_proto_cc", - "//tensorflow/compiler/xrt:xrt_server", - "//tensorflow/compiler/xrt/cc:xrt_ops", - "//tensorflow/core:core_cpu", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:tensorflow_opensource", - "//tensorflow/core:test", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:status", - "@local_xla//xla:literal", - "@local_xla//xla:literal_util", - "@local_xla//xla:shape_util", - "@local_xla//xla:xla_data_proto_cc", - "@local_xla//xla/client:client_library", - "@local_xla//xla/client:executable_build_options", - "@local_xla//xla/client:local_client", - "@local_xla//xla/client:padding", - "@local_xla//xla/client:xla_builder", - "@local_xla//xla/client:xla_computation", - "@local_xla//xla/client/lib:arithmetic", - "@local_xla//xla/client/lib:constants", - "@local_xla//xla/service:platform_util", - "@local_xla//xla/stream_executor:platform", - ], -) - -tf_cc_test( - name = "raw_api_test_cpu", - size = "medium", - srcs = [], - args = [ - "--xla_test_device=XLA_CPU", - "--xla_platform=CPU", - ], - deps = [ - ":raw_api_test_lib", - "//tensorflow/compiler/jit:xla_cpu_device", - ], -) - -tf_cuda_cc_test( - name = "raw_api_test_gpu", - size = "medium", - srcs = [], - args = [ - "--xla_test_device=XLA_GPU", - "--xla_platform=GPU", - ], - tags = tf_cuda_tests_tags() + [ - "no_cuda_asan", # 
TODO(b/171319142): re-enable. - ], - deps = [ - ":raw_api_test_lib", - "//tensorflow/compiler/jit:xla_gpu_device", - ], -) diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc deleted file mode 100644 index 10f32f44aa2236..00000000000000 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ /dev/null @@ -1,2291 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "absl/log/check.h" -#include "absl/log/log.h" -#include "absl/strings/str_cat.h" -#include "absl/types/span.h" -#include "tensorflow/cc/client/client_session.h" -#include "tensorflow/cc/framework/ops.h" -#include "tensorflow/cc/framework/scope.h" -#include "tensorflow/cc/ops/array_ops.h" -#include "tensorflow/cc/ops/const_op.h" -#include "tensorflow/compiler/tf2xla/literal_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "xla/client/client_library.h" -#include "xla/client/executable_build_options.h" -#include "xla/client/lib/arithmetic.h" -#include "xla/client/lib/constants.h" -#include "xla/client/local_client.h" -#include "xla/client/padding.h" -#include "xla/client/xla_builder.h" -#include "xla/client/xla_computation.h" -#include "xla/layout.h" -#include "xla/layout_util.h" -#include "xla/literal.h" -#include "xla/literal_util.h" -#include "xla/service/platform_util.h" -#include "xla/shape.h" -#include "xla/shape_util.h" -#include "xla/stream_executor/platform.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h" -#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h" -#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/tstring.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/public/session_options.h" -#include "tensorflow/core/util/command_line_flags.h" -#include "tsl/lib/core/status_test_util.h" -#include "tsl/platform/status.h" -#include "tsl/platform/statusor.h" - -namespace tensorflow { -namespace { - -xla::XlaComputation ReturnDynamicR1() { - xla::XlaBuilder builder("ReturnDynamicR1"); - auto p0 = xla::Parameter(&builder, 0, - xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); - auto p1 = xla::Parameter(&builder, 1, - xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); - auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), - "P2"); - auto sum = xla::Add(p0, p1); - auto pad_sum = xla::SetDimensionSize(sum, p2, 0); - return builder.Build(pad_sum).value(); -} - 
-xla::XlaComputation ReturnDynamicR2() { - xla::XlaBuilder builder("ReturnDynamicR2"); - auto p0 = xla::Parameter(&builder, 0, - xla::ShapeUtil::MakeShape(xla::F32, {2, 4}), "P0"); - auto p1 = xla::Parameter(&builder, 1, - xla::ShapeUtil::MakeShape(xla::F32, {2, 4}), "P1"); - auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), - "P2"); - auto sum = xla::Add(p0, p1); - auto pad_sum_dim0 = xla::SetDimensionSize(sum, p2, 0); - auto pad_sum_dim1 = xla::SetDimensionSize(pad_sum_dim0, p2, 1); - return builder.Build(pad_sum_dim1).value(); -} - -xla::XlaComputation AcceptDynamicR1() { - xla::XlaBuilder builder("AcceptDynamicR1"); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_shape.set_dynamic_dimension(0, true); - auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); - auto p1 = xla::Parameter(&builder, 1, dyn_shape, "P1"); - auto sum = xla::Add(p0, p1); - return builder.Build(sum).value(); -} - -xla::XlaComputation AcceptDynamicR2() { - xla::XlaBuilder builder("AcceptDynamicR2"); - xla::Shape dyn_shape; - dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); - dyn_shape.set_dynamic_dimension(1, true); - auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); - auto negate = xla::Neg(p0); - return builder.Build(negate).value(); -} - -xla::XlaComputation ReturnDynamicR1Tuple() { - xla::XlaBuilder builder("ReturnDynamicR1Tuple"); - auto p0 = xla::Parameter(&builder, 0, - xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); - auto p1 = xla::Parameter(&builder, 1, - xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); - auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), - "P2"); - auto sum = xla::Add(p0, p1); - auto sub = xla::Sub(p0, p1); - auto one = xla::One(&builder, xla::S32); - auto pad_sum = xla::SetDimensionSize(sum, p2, 0); - auto pad_sub = xla::SetDimensionSize(sub, p2 + one, 0); - auto tuple = xla::Tuple(&builder, {pad_sum, sum, pad_sub}); - return builder.Build(tuple).value(); -} - -xla::XlaComputation AcceptDynamicR1Tuple() { - xla::XlaBuilder builder("AcceptDynamicR1"); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_shape.set_dynamic_dimension(0, true); - xla::Shape tuple_shape = - xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); - xla::Shape nest_tuple_shape = - xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); - auto p = xla::Parameter(&builder, 0, tuple_shape, "P0"); - auto p0 = xla::GetTupleElement(p, 0); - auto p1 = xla::GetTupleElement(p, 1); - auto sum = xla::Add(p0, p1); - return builder.Build(sum).value(); -} - -template -xla::LiteralProto CreateR0(T v) { - auto array = xla::LiteralUtil::CreateR0(v); - return array.ToProto(); -} - -tensorflow::SessionOptions GetSessionOptions() { - tensorflow::SessionOptions options; - // Disable optimizations for static graph to allow calls to Session::Extend. 
- options.config.mutable_experimental()->set_disable_optimize_for_static_graph( - true); - return options; -} - -class XrtClientSession : public ClientSession { - public: - explicit XrtClientSession(const Scope& scope) - : ClientSession(scope, GetSessionOptions()) { - auto clear_all = ops::XRTReleaseAllAllocations(scope); - std::vector outputs; - TF_CHECK_OK(Run(ClientSession::FeedType(), {}, {clear_all}, &outputs)); - } -}; - -string* xla_test_device_ptr; // initial value set in main() -string* xla_platform_ptr; // initial value set in main() - -string DeviceFromFlag() { - string xla_test_device = *xla_test_device_ptr; - return absl::StrCat("/device:", xla_test_device, ":0"); -} - -std::vector GetAttrLayout(absl::Span minor_to_mayor) { - std::vector layout; - for (auto dim : minor_to_mayor) { - layout.push_back(static_cast(dim)); - } - return layout; -} - -xla::LiteralProto TwoElementTuple() { - auto array = xla::LiteralUtil::CreateR1({1.0f, 3.0f}); - auto matrix = xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}); - auto tuple = xla::LiteralUtil::MakeTuple({&array, &matrix}); - return tuple.ToProto(); -} - -xla::LiteralProto BasedTwoElementTuple(float base) { - auto array = xla::LiteralUtil::CreateR1({base, base + 1}); - auto matrix = xla::LiteralUtil::CreateR2( - {{base + 2, base + 3}, {base + 4, base + 5}}); - auto tuple = xla::LiteralUtil::MakeTuple({&array, &matrix}); - return tuple.ToProto(); -} - -xla::LiteralProto ScalarLiteral() { - auto scalar = xla::LiteralUtil::CreateR0(12.0f); - return scalar.ToProto(); -} - -xla::LiteralProto NestedTuple() { - auto array = xla::LiteralUtil::CreateR1({1.0f, 3.0f}); - auto matrix = xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}); - auto tuple = xla::LiteralUtil::MakeTuple({&array, &matrix}); - auto scalar = xla::LiteralUtil::CreateR0(12.0f); - auto nested = xla::LiteralUtil::MakeTuple({&tuple, &scalar}); - return nested.ToProto(); -} - -xla::LiteralProto MakeTuple0() { - auto scalar = xla::LiteralUtil::CreateR0(12.0f); - auto array = xla::LiteralUtil::CreateR1({1.0f, 3.0f}); - auto matrix = xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}); - auto tuple = xla::LiteralUtil::MakeTuple({&array, &matrix}); - auto nested0 = xla::LiteralUtil::MakeTuple({&scalar, &tuple}); - auto nested1 = xla::LiteralUtil::MakeTuple({&scalar, &nested0}); - return nested1.ToProto(); -} - -xla::LiteralProto FloatVector(absl::Span v) { - auto array = xla::LiteralUtil::CreateR1(v); - return array.ToProto(); -} - -xla::LiteralProto FloatMatrix( - std::initializer_list> v, - const xla::Layout& layout) { - auto array = xla::LiteralUtil::CreateR2WithLayout(v, layout); - return array.ToProto(); -} - -xla::Literal ReadOutputLiteral(const std::vector& outputs, size_t idx) { - xla::LiteralProto response; - CHECK(ParseFromTString(outputs[idx].scalar()(), &response)); - return xla::Literal::CreateFromProto(response).value(); -} - -bool CompareLiteralProtos(const xla::LiteralProto& a, - const xla::LiteralProto& b) { - auto l_a = xla::Literal::CreateFromProto(a).value(); - auto l_b = xla::Literal::CreateFromProto(b).value(); - bool equal = l_a == l_b; - if (!equal) { - LOG(INFO) << "LiteralProtos don't match:\n" - << a.DebugString() << "\n!=\n" - << b.DebugString(); - } - return equal; -} - -bool CompareLiteralToLiteralProto(const xla::Literal& a, - const xla::LiteralProto& b) { - auto l_b = xla::Literal::CreateFromProto(b).value(); - bool equal = a == l_b; - if (!equal) { - LOG(INFO) << "Literal and LiteralProto don't match:\n" - << a.ToProto().DebugString() << "\n!=\n" - << b.DebugString(); - 
} - return equal; -} - -bool CompareLiterals(const xla::Literal& a, const xla::Literal& b) { - bool equal = a == b; - if (!equal) { - LOG(INFO) << "Literals don't match:\n" - << a.ToProto().DebugString() << "\n!=\n" - << b.ToProto().DebugString(); - } - return equal; -} - -xla::XlaComputation OnePlusTwo() { - xla::XlaBuilder builder("OnePlusTwo"); - auto c0 = xla::ConstantR0(&builder, 1.0f); - auto c1 = xla::ConstantR0(&builder, 2.0f); - xla::Add(c0, c1); - return builder.Build().value(); -} - -xla::XlaComputation AddAndScale() { - xla::XlaBuilder builder("AddAndScale"); - auto p0 = xla::Parameter(&builder, 0, - xla::ShapeUtil::MakeShape(xla::F32, {2}), "P0"); - auto p1 = xla::Parameter(&builder, 1, - xla::ShapeUtil::MakeShape(xla::F32, {2}), "P1"); - auto sum = xla::Add(p0, p1); - auto c = xla::ConstantR0(&builder, 3.0f); - xla::Mul(sum, c); - return builder.Build().value(); -} - -xla::XlaComputation SubAndScale() { - xla::XlaBuilder builder("SubAndScale"); - auto p0 = xla::Parameter(&builder, 0, - xla::ShapeUtil::MakeShape(xla::F32, {2}), "P0"); - auto p1 = xla::Parameter(&builder, 1, - xla::ShapeUtil::MakeShape(xla::F32, {2}), "P1"); - auto sum = xla::Sub(p0, p1); - auto c = xla::ConstantR0(&builder, 11.0f); - xla::Mul(sum, c); - return builder.Build().value(); -} - -xla::XlaComputation Dot() { - xla::XlaBuilder builder("Dot"); - auto p0 = xla::Parameter( - &builder, 0, - xla::ShapeUtil::MakeShapeWithDenseLayout(xla::F32, {2, 2}, {0, 1}), "P0"); - auto p1 = xla::Parameter( - &builder, 1, - xla::ShapeUtil::MakeShapeWithDenseLayout(xla::F32, {2, 1}, {0, 1}), "P1"); - xla::DotDimensionNumbers ddn; - ddn.add_lhs_contracting_dimensions(1); - ddn.add_rhs_contracting_dimensions(0); - xla::DotGeneral(p0, p1, ddn); - return builder.Build().value(); -} - -xla::XlaComputation AddS64() { - xla::XlaBuilder builder("AddS64"); - auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::S64, {}), - "P0"); - auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::S64, {}), - "P1"); - xla::Add(p0, p1); - return builder.Build().value(); -} - -xla::XlaComputation AddAndTuple() { - xla::XlaBuilder builder("AddAndTuple"); - auto p0 = xla::Parameter(&builder, 0, - xla::ShapeUtil::MakeShape(xla::F32, {2}), "P0"); - auto p1 = xla::Parameter(&builder, 1, - xla::ShapeUtil::MakeShape(xla::F32, {2}), "P1"); - auto sum = xla::Add(p0, p1); - xla::Tuple(&builder, {sum}); - return builder.Build().value(); -} - -xla::XlaComputation AddAndSubTuple() { - xla::XlaBuilder builder("AddAndSubTuple"); - auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::F32, {}), - "P0"); - auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::F32, {}), - "P1"); - auto sum = xla::Add(p0, p1); - auto sub = xla::Sub(p0, p1); - xla::Tuple(&builder, {sum, sub}); - return builder.Build().value(); -} - -xla::XlaComputation BroadcastComputation(const xla::Shape& shape, - absl::Span dimensions) { - xla::XlaBuilder builder("BroadcastComputation"); - auto p0 = xla::Parameter(&builder, 0, shape, "P0"); - xla::Broadcast(p0, dimensions); - return builder.Build().value(); -} - -xla::XlaComputation IsEqualComputation(const xla::Shape& shape) { - xla::XlaBuilder builder("IsEqualComputation"); - auto p0 = xla::Parameter(&builder, 0, shape, "P0"); - auto p1 = xla::Parameter(&builder, 1, shape, "P1"); - auto cmp = - xla::Ne(xla::Sub(p0, p1), xla::Zero(&builder, shape.element_type())); - auto icmp = xla::ConvertElementType(cmp, xla::S32); - xla::ReduceAll(icmp, xla::Zero(&builder, xla::S32), - 
xla::CreateScalarAddComputation(xla::S32, &builder)); - return builder.Build().value(); -} - -void StoreComputationSnapshot(const xla::XlaComputation& computation, - xla::HloSnapshot* dst) { - auto snapshot = computation.Snapshot().value(); - *dst = *snapshot; -} - -xla::ProgramShape XlaCompiledProgramShape( - const xla::XlaComputation& computation, - const xla::ProgramShape& input_program_shape) { - se::Platform* platform = - xla::PlatformUtil::GetPlatform(*xla_platform_ptr).value(); - xla::LocalClient* client = - xla::ClientLibrary::GetOrCreateLocalClient(platform).value(); - xla::ExecutableBuildOptions exec_options; - exec_options.set_result_layout(input_program_shape.result()); - std::vector parameters_shapes; - for (int64_t i = 0; i < input_program_shape.parameters_size(); ++i) { - parameters_shapes.push_back(&input_program_shape.parameters(i)); - } - std::vector> local_executables = - client->Compile(computation, parameters_shapes, exec_options).value(); - EXPECT_EQ(local_executables.size(), 1); - std::unique_ptr local_executable = - std::move(local_executables[0]); - return local_executable->executable() - ->module() - .entry_computation() - ->ComputeProgramShape(); -} - -TEST(RawApiTest, AllocFromTensor) { - xla::Literal literal = - xla::LiteralUtil::CreateR2({{4.0f, 5.0f}, {6.0f, 7.0f}}); - Tensor tensor; - TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor)); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - std::vector layout = - GetAttrLayout(literal.shape().layout().minor_to_major()); - ops::XRTAllocateFromTensor::Attrs alloc_attrs = - ops::XRTAllocateFromTensor::Layouts(layout); - auto handle = - ops::XRTAllocateFromTensor(root, {tensor}, {tensor.shape()}, alloc_attrs); - auto read_back = ops::XRTReadLiteralAndRelease(root, handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response)); -} - -TEST(RawApiTest, AllocUninitialized) { - xla::Literal literal = - xla::LiteralUtil::CreateR2({{4.0f, 5.0f}, {6.0f, 7.0f}}); - Tensor tensor; - TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor)); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - std::vector layout = - GetAttrLayout(literal.shape().layout().minor_to_major()); - - auto allocate_op = - ops::XRTAllocateUninitialized(root, DT_FLOAT, tensor.shape()); - - Tensor handle; - std::vector outputs; - XrtClientSession session(root); - // Allocate the tensor - { - TF_EXPECT_OK(session.Run({allocate_op}, &outputs)); - handle = outputs[0]; - } - - // Make sure it has the expected shape - { - auto read_back_op = ops::XRTReadLiteral(root, handle); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run({read_back_op}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - xla::LiteralProto read_back_literal; - EXPECT_TRUE( - ParseFromTString(outputs[0].scalar()(), &read_back_literal)); - Tensor read_back_tensor; - TF_ASSERT_OK(LiteralToHostTensor( - xla::Literal::CreateFromProto(read_back_literal).value(), DT_FLOAT, - &read_back_tensor)); - - // The shape should be the same as 'tensor', but we don't have any - // expectation about the value of the tensors yet since it is uninitialized - EXPECT_EQ(tensor.shape(), read_back_tensor.shape()); - } - - // Make sure we can write to it - xla::LiteralProto 
new_literal = - xla::LiteralUtil::CreateR2({{9.0f, 2.0f}, {4.0f, 1.0f}}).ToProto(); - { - auto new_value = ops::Const(root.WithDevice("/device:CPU:0"), - new_literal.SerializeAsString()); - auto write_op = ops::XRTWriteLiteral(root, Input(handle), new_value); - TF_ASSERT_OK(root.status()); - TF_EXPECT_OK(session.Run({write_op}, &outputs)); - } - - // Now read it back - { - auto read_back_op = ops::XRTReadLiteralAndRelease(root, handle); - TF_ASSERT_OK(root.status()); - TF_EXPECT_OK(session.Run({read_back_op}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralProtos(response, new_literal)); - } -} - -TEST(RawApiTest, AllocFromTensorTuple) { - xla::Literal literal0 = - xla::LiteralUtil::CreateR2({{4.0f, 5.0f}, {6.0f, 7.0f}}); - xla::Literal literal1 = - xla::LiteralUtil::CreateR2({{14.0f, -5.0f}, {16.0f, 17.0f}}); - xla::Literal literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1}); - Tensor tensor0; - TF_ASSERT_OK(LiteralToHostTensor(literal0, DT_FLOAT, &tensor0)); - Tensor tensor1; - TF_ASSERT_OK(LiteralToHostTensor(literal1, DT_FLOAT, &tensor1)); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - std::vector layout = GetShapeLayoutVector(literal.shape()).value(); - ops::XRTAllocateFromTensor::Attrs alloc_attrs = - ops::XRTAllocateFromTensor::Layouts(layout); - auto handle = ops::XRTAllocateFromTensor(root, {tensor0, tensor1}, - {tensor0.shape(), tensor1.shape()}, - alloc_attrs); - auto read_back = ops::XRTReadLiteralAndRelease(root, handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response)); -} - -TEST(RawApiTest, AllocFromTensorTupleSingle) { - xla::Literal literal0 = - xla::LiteralUtil::CreateR2({{4.0f, 5.0f}, {6.0f, 7.0f}}); - xla::Literal literal = xla::LiteralUtil::MakeTuple({&literal0}); - Tensor tensor0; - TF_ASSERT_OK(LiteralToHostTensor(literal0, DT_FLOAT, &tensor0)); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - std::vector layout = GetShapeLayoutVector(literal.shape()).value(); - ops::XRTAllocateFromTensor::Attrs alloc_attrs = - ops::XRTAllocateFromTensor::Layouts(layout).MakeTuple(true); - auto handle = ops::XRTAllocateFromTensor(root, {tensor0}, {tensor0.shape()}, - alloc_attrs); - auto read_back = ops::XRTReadLiteralAndRelease(root, handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response)); -} - -TEST(RawApiTest, AllocFromTensorRelayout) { - xla::Literal literal = - xla::LiteralUtil::CreateR2({{4.0f, 5.0f}, {6.0f, 7.0f}}); - Tensor tensor; - TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor)); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - // Use inverse array layout with the tensor data above. 
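// The layout attribute built below is a minor-to-major ordering: for this
// 2x2 array, {1, 0} would be the usual row-major order, while {0, 1} makes
// dimension 0 the fastest-varying one (column-major). A rough sketch of the
// index mapping, in plain C++ with made-up names, just to illustrate:
//
//   // Linear offset of element (i, j) in a 2x2 buffer laid out as {0, 1}.
//   int offset_minor_to_major_01(int i, int j) { return i + 2 * j; }
//
// Interpreting the row-major host data {4, 5, 6, 7} through that mapping
// gives the logical array {{4, 6}, {5, 7}}, which is the expected_literal
// this test checks after reading the allocation back.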
- std::vector layout({0, 1}); - ops::XRTAllocateFromTensor::Attrs alloc_attrs = - ops::XRTAllocateFromTensor::Layouts(layout); - auto handle = - ops::XRTAllocateFromTensor(root, {tensor}, {tensor.shape()}, alloc_attrs); - auto read_back = ops::XRTReadLiteralAndRelease(root, handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - // We have sent literal's data (in array layout) with a attribute layout - // {0,1}, so the expected literal read from device needs to be changed - // accordingly. - xla::Literal expected_literal = - xla::LiteralUtil::CreateR2({{4.0f, 6.0f}, {5.0f, 7.0f}}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected_literal, response)); -} - -TEST(RawApiTest, AllocAndRewrite) { - xrt::XLAAllocation alloc; - *alloc.mutable_value() = - xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto(); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value = - ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString()); - auto handle = ops::XRTAllocate(root, value); - auto read_back = ops::XRTReadLiteral(root, handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, handle}, &outputs)); - EXPECT_EQ(outputs.size(), 2); - - int64_t allocation_handle = outputs[1].scalar()(); - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); - - xla::LiteralProto new_literal = - xla::LiteralUtil::CreateR2({{9, 2}, {4, 1}}).ToProto(); - auto new_value = ops::Const(root.WithDevice("/device:CPU:0"), - new_literal.SerializeAsString()); - auto write_op = - ops::XRTWriteLiteral(root, Input(allocation_handle), new_value); - TF_ASSERT_OK(root.status()); - TF_EXPECT_OK(session.Run({write_op}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - EXPECT_EQ(allocation_handle, outputs[0].scalar()()); - - auto read_after_write = ops::XRTReadLiteral(root, Input(allocation_handle)); - TF_EXPECT_OK(session.Run({read_after_write}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto new_response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &new_response)); - EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response)); - - Tensor release_tensor(DT_INT64, TensorShape({1})); - release_tensor.flat()(0) = allocation_handle; - - auto release = ops::XRTReleaseAllocationHandle(root, release_tensor); - TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {}, {release}, &outputs)); -} - -TEST(RawApiTest, AllocReleaseMany) { - xrt::XLAAllocation alloc1; - *alloc1.mutable_value() = - xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto(); - xrt::XLAAllocation alloc2; - *alloc2.mutable_value() = - xla::LiteralUtil::CreateR2({{6, 7}, {4, 5}}).ToProto(); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value1 = - ops::Const(root.WithDevice("/device:CPU:0"), alloc1.SerializeAsString()); - auto value2 = - ops::Const(root.WithDevice("/device:CPU:0"), alloc2.SerializeAsString()); - auto handle1 = ops::XRTAllocate(root, value1); - auto handle2 = ops::XRTAllocate(root, value2); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({handle1, handle2}, &outputs)); - EXPECT_EQ(outputs.size(), 2); 
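// Both release ops used in this file take a DT_INT64 tensor of handles, so
// several allocations (or, in the next test, several compilations) can be
// released by a single XRTReleaseAllocationHandle / XRTReleaseCompilationHandle
// node; the TensorShape({2}) release tensor assembled below does exactly that.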
- - int64_t allocation_handle1 = outputs[0].scalar()(); - int64_t allocation_handle2 = outputs[1].scalar()(); - - Tensor release_tensor(DT_INT64, TensorShape({2})); - release_tensor.flat()(0) = allocation_handle1; - release_tensor.flat()(1) = allocation_handle2; - - auto release = ops::XRTReleaseAllocationHandle(root, release_tensor); - TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {}, {release}, &outputs)); -} - -TEST(RawApiTest, CompileAndReleaseMany) { - xrt::XLAComputation c1; - auto config1 = c1.mutable_config(); - auto shapes1 = config1->mutable_program_shape(); - *shapes1->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes1->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes1->mutable_result() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - StoreComputationSnapshot(AddAndScale(), c1.mutable_hlo_snapshot()); - - xrt::XLAComputation c2; - auto config2 = c2.mutable_config(); - auto shapes2 = config2->mutable_program_shape(); - *shapes2->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes2->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes2->mutable_result() = - xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})}) - .ToProto(); - StoreComputationSnapshot(AddAndTuple(), c2.mutable_hlo_snapshot()); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto computation1 = - ops::Const(root.WithDevice("/device:CPU:0"), c1.SerializeAsString()); - auto c_handle1 = ops::XRTCompile(root, computation1); - auto computation2 = - ops::Const(root.WithDevice("/device:CPU:0"), c2.SerializeAsString()); - auto c_handle2 = ops::XRTCompile(root, computation2); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({c_handle1.handle, c_handle2.handle}, &outputs)); - EXPECT_EQ(outputs.size(), 2); - - int64_t compilation_handle1 = outputs[0].scalar()(); - int64_t compilation_handle2 = outputs[1].scalar()(); - - Tensor release_tensor(DT_INT64, TensorShape({2})); - release_tensor.flat()(0) = compilation_handle1; - release_tensor.flat()(1) = compilation_handle2; - - auto release = ops::XRTReleaseCompilationHandle(root, release_tensor); - TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {}, {release}, &outputs)); -} - -TEST(RawApiTest, AllocAndClearAll) { - xrt::XLAAllocation alloc; - *alloc.mutable_value() = - xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto(); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value = - ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString()); - auto handle = ops::XRTAllocate(root, value); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({handle}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - int64_t allocation_handle = outputs[0].scalar()(); - - auto clear_all = ops::XRTReleaseAllAllocations(root); - - TF_EXPECT_OK( - session.Run(ClientSession::FeedType(), {}, {clear_all}, &outputs)); - EXPECT_EQ(outputs.size(), 0); - - auto read_after_clear = ops::XRTReadLiteral(root, Input(allocation_handle)); - EXPECT_EQ(session.Run({read_after_clear}, &outputs).code(), - error::Code::NOT_FOUND); -} - -TEST(RawApiTest, ReadAndWriteState) { - xrt::XLAAllocation alloc; - *alloc.mutable_value() = TwoElementTuple(); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value = - 
ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString()); - auto handle = ops::XRTAllocate(root, value); - auto read_back = ops::XRTReadLiteral(root, handle); - auto release = ops::XRTReleaseAllocationHandle( - root.WithControlDependencies(read_back), handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK( - session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); -} - -TEST(RawApiTest, ReadAndWriteStateAutoFree) { - xrt::XLAAllocation alloc; - *alloc.mutable_value() = TwoElementTuple(); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value = - ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString()); - auto handle = ops::XRTAllocate(root, value); - auto read_back = ops::XRTReadLiteralAndRelease(root, handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response)); -} - -TEST(RawApiTest, SubBuffer) { - xrt::XLAAllocation alloc; - *alloc.mutable_value() = NestedTuple(); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value = - ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString()); - auto base_handle = ops::XRTAllocate(root, value); - auto index_0 = ops::Const(root.WithDevice("/device:CPU:0"), {0}); - auto index_1 = ops::Const(root.WithDevice("/device:CPU:0"), {1}); - auto index_00 = ops::Const(root.WithDevice("/device:CPU:0"), {0, 0}); - auto sub_0 = ops::XRTSubTuple(root, base_handle, index_0); - auto sub_1 = ops::XRTSubTuple(root, base_handle, index_1); - auto sub_00 = ops::XRTSubTupleAndRelease( - root.WithControlDependencies( - {sub_0.output_handle.op(), sub_1.output_handle.op()}), - base_handle, index_00); - auto value_0 = ops::XRTReadLiteralAndRelease(root, sub_0); - auto value_1 = ops::XRTReadLiteralAndRelease(root, sub_1); - auto value_00 = ops::XRTReadLiteralAndRelease(root, sub_00); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({value_0, value_1, value_00}, &outputs)); - - auto base_literal = xla::Literal::CreateFromProto(alloc.value()).value(); - auto base_elements = base_literal.DecomposeTuple(); - auto nested_0_elements = base_elements[0].Clone().DecomposeTuple(); - xla::LiteralProto response_0; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response_0)); - EXPECT_TRUE(CompareLiteralToLiteralProto(base_elements[0], response_0)); - xla::LiteralProto response_1; - EXPECT_TRUE(ParseFromTString(outputs[1].scalar()(), &response_1)); - EXPECT_TRUE(CompareLiteralToLiteralProto(base_elements[1], response_1)); - xla::LiteralProto response_00; - EXPECT_TRUE(ParseFromTString(outputs[2].scalar()(), &response_00)); - EXPECT_TRUE(CompareLiteralToLiteralProto(nested_0_elements[0], response_00)); -} - -TEST(RawApiTest, MakeTuple) { - xrt::XLAAllocation alloc_0; - *alloc_0.mutable_value() = TwoElementTuple(); - xrt::XLAAllocation alloc_1; - *alloc_1.mutable_value() = ScalarLiteral(); - - // The trivial tuple that just forwards its input and releases it. 
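// Reading the three descriptors built below: an XLATupleNode either names an
// input_index (a leaf referring to one of the handles passed to XRTMakeTuple)
// or carries child "tuples" nodes, and release_input_handle frees the referenced
// input once the new tuple has been assembled. Under that reading, which is
// inferred from this test rather than from separate documentation:
//   desc_0  -> forwards input 0 unchanged (and releases it),
//   desc_1  -> builds (input0, (input0, input1)),
//   desc_2  -> builds (input1, input0), releasing both inputs.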
- xrt::XLATupleNode desc_0; - desc_0.set_input_index(0); - desc_0.set_release_input_handle(true); - - xrt::XLATupleNode desc_1; - auto subdesc_10 = desc_1.add_tuples(); - auto subdesc_11 = desc_1.add_tuples(); - subdesc_10->set_input_index(0); - auto subdesc_110 = subdesc_11->add_tuples(); - subdesc_110->set_input_index(0); - auto subdesc_111 = subdesc_11->add_tuples(); - subdesc_111->set_input_index(1); - - xrt::XLATupleNode desc_2; - auto subdesc_20 = desc_2.add_tuples(); - auto subdesc_21 = desc_2.add_tuples(); - subdesc_20->set_input_index(1); - subdesc_20->set_release_input_handle(true); - subdesc_21->set_input_index(0); - subdesc_21->set_release_input_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto value_0 = - ops::Const(root.WithDevice("/device:CPU:0"), alloc_0.SerializeAsString()); - auto handle_0 = ops::XRTAllocate(root, value_0); - auto value_1 = - ops::Const(root.WithDevice("/device:CPU:0"), alloc_1.SerializeAsString()); - auto handle_1 = ops::XRTAllocate(root, value_1); - auto tuple_0 = - ops::Const(root.WithDevice("/device:CPU:0"), desc_0.SerializeAsString()); - auto handle_2 = - ops::XRTMakeTuple(root, tuple_0, {static_cast(handle_0)}); - // handle_0 has now been released. - auto tuple_1 = - ops::Const(root.WithDevice("/device:CPU:0"), desc_1.SerializeAsString()); - auto handle_3 = ops::XRTMakeTuple( - root, tuple_1, - {static_cast(handle_1), static_cast(handle_2)}); - auto tuple_2 = - ops::Const(root.WithDevice("/device:CPU:0"), desc_2.SerializeAsString()); - // Make sure this runs after handle_3 has completed, since it will free - // handle_1 and handle_2. - auto handle_4 = ops::XRTMakeTuple( - root.WithControlDependencies(handle_3), tuple_2, - {static_cast(handle_1), static_cast(handle_2)}); - // handle_1 and handle_2 have now been released. 
- - auto res_0 = ops::XRTReadLiteralAndRelease(root, handle_3); - auto res_1 = ops::XRTReadLiteralAndRelease(root, handle_4); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({res_0, res_1}, &outputs)); - xla::LiteralProto response_0; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response_0)); - xla::LiteralProto response_1; - EXPECT_TRUE(ParseFromTString(outputs[1].scalar()(), &response_1)); - - auto expected_0 = MakeTuple0(); - EXPECT_TRUE(CompareLiteralProtos(response_0, expected_0)); - auto expected_1 = NestedTuple(); - EXPECT_TRUE(CompareLiteralProtos(response_1, expected_1)); -} - -TEST(RawApiTest, ExecuteChainedOpByOp) { - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - - auto make_computation = [](const std::function& fn) { - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - StoreComputationSnapshot(fn(), c.mutable_hlo_snapshot()); - return c.SerializeAsString(); - }; - - auto c_add_scale = make_computation(AddAndScale); - auto c_sub_scale = make_computation(SubAndScale); - - auto c_add_scale_op = ops::XRTCompile( - root, ops::Const(root.WithDevice("/device:CPU:0"), c_add_scale)); - auto c_sub_scale_op = ops::XRTCompile( - root, ops::Const(root.WithDevice("/device:CPU:0"), c_sub_scale)); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK( - session.Run({c_add_scale_op.handle, c_sub_scale_op.handle}, &outputs)); - EXPECT_EQ(outputs.size(), 2); - - int64_t c_add_scale_handle = outputs[0].scalar()(); - int64_t c_sub_scale_handle = outputs[1].scalar()(); - - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({8.0f, 5.0f}); - - auto p0_handle = ops::XRTAllocate( - root, - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString())); - auto p1_handle = ops::XRTAllocate( - root, - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString())); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(false); - e.set_release_compilation_handle(false); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto result0 = ops::XRTExecute(root, Input(c_add_scale_handle), e_config, - {Output(p0_handle), Output(p1_handle)}); - auto result1 = ops::XRTExecute(root, Input(c_sub_scale_handle), e_config, - {Output(p0_handle), Output(p1_handle)}); - auto result = ops::XRTExecute(root, Input(c_add_scale_handle), e_config, - {result0.output_handle, result1.output_handle}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR1({-150.0f, -36.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, ExecuteChained) { - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - - auto make_computation = [](const std::function& fn) { - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = 
config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - StoreComputationSnapshot(fn(), c.mutable_hlo_snapshot()); - return c.SerializeAsString(); - }; - - auto c_add_scale = make_computation(AddAndScale); - auto c_sub_scale = make_computation(SubAndScale); - - auto c_add_scale_op = ops::XRTCompile( - root, ops::Const(root.WithDevice("/device:CPU:0"), c_add_scale)); - auto c_sub_scale_op = ops::XRTCompile( - root, ops::Const(root.WithDevice("/device:CPU:0"), c_sub_scale)); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK( - session.Run({c_add_scale_op.handle, c_sub_scale_op.handle}, &outputs)); - EXPECT_EQ(outputs.size(), 2); - - int64_t c_add_scale_handle = outputs[0].scalar()(); - int64_t c_sub_scale_handle = outputs[1].scalar()(); - - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({8.0f, 5.0f}); - - auto p0_handle_op = ops::XRTAllocate( - root, - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString())); - auto p1_handle_op = ops::XRTAllocate( - root, - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString())); - - TF_EXPECT_OK(session.Run({p0_handle_op, p1_handle_op}, &outputs)); - EXPECT_EQ(outputs.size(), 2); - - int64_t p0_handle = outputs[0].scalar()(); - int64_t p1_handle = outputs[1].scalar()(); - - xrt::XRTChainedExecuteConfig config; - auto config_const = - ops::Const(root.WithDevice("/device:CPU:0"), config.SerializeAsString()); - - xrt::XRTChainedExecutePlan plan; - xrt::XRTChainedExecuteOp* op; - xrt::XRTChainedExecuteOp::Input* input; - xrt::XRTChainedExecuteOp::Output* output; - - // Index 0 - op = plan.add_ops(); - op->set_data_handle(p0_handle); - - // Index 1 - op = plan.add_ops(); - op->set_data_handle(p1_handle); - - // Index 2 - op = plan.add_ops(); - op->set_computation_handle(c_add_scale_handle); - input = op->add_inputs(); - input->set_op_index(0); - input = op->add_inputs(); - input->set_op_index(1); - - // Index 3 - op = plan.add_ops(); - op->set_computation_handle(c_sub_scale_handle); - input = op->add_inputs(); - input->set_op_index(0); - input = op->add_inputs(); - input->set_op_index(1); - - // Index 4 - op = plan.add_ops(); - op->set_computation_handle(c_add_scale_handle); - input = op->add_inputs(); - input->set_op_index(2); - input = op->add_inputs(); - input->set_op_index(3); - output = op->add_outputs(); - output->set_result_index(0); - - auto plan_const = - ops::Const(root.WithDevice("/device:CPU:0"), plan.SerializeAsString()); - auto result = ops::XRTExecuteChained(root, plan_const, config_const); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run({result}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - auto handles_vec = outputs[0].vec(); - EXPECT_EQ(handles_vec.size(), 1); - - auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(0))); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR1({-150.0f, -36.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, CompileAndExecute) { - 
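// The expected output checked below follows from the AddAndScale() helper
// defined near the top of this file, which computes (P0 + P1) * 3. A host-side
// sanity check of the same arithmetic, kept inside a comment since it is
// illustrative only (plain standard C++, needs <array>, names made up):
//
//   std::array<float, 2> p0{1.0f, 2.0f}, p1{8.0f, 5.0f}, out{};
//   for (int i = 0; i < 2; ++i) out[i] = (p0[i] + p1[i]) * 3.0f;
//   // out == {27.0f, 21.0f}, matching the literal compared against read_back.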
xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({8.0f, 5.0f}); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - - xla::ProgramShapeProto program_shape; - EXPECT_TRUE(ParseFromTString(outputs[1].vec()(0), &program_shape)); - EXPECT_EQ(program_shape.parameters_size(), 2); -} - -TEST(RawApiTest, DynamicR1Test) { - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({1.0f, -1.0f, 2.5f, 1.17f}); - xrt::XLAAllocation p2; - *p2.mutable_value() = CreateR0(2); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_shape.set_dynamic_dimension(0, true); - *shapes->mutable_result() = dyn_shape.ToProto(); - StoreComputationSnapshot(ReturnDynamicR1(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Scope cpu_root = root.WithDevice("/device:CPU:0"); - auto e_config = ops::Const(cpu_root, e.SerializeAsString()); - auto computation = ops::Const(cpu_root, c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto p2_value = 
ops::Const(cpu_root, p2.SerializeAsString()); - auto p2_handle = ops::XRTAllocate(root, p2_value); - auto result = ops::XRTExecute( - root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); - auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, DynamicR2Test) { - xrt::XLAAllocation p0; - *p0.mutable_value() = xla::LiteralUtil::CreateR2({{1.0f, 2.0f, 0.5f, -1.0f}, - {1.5f, 2.5f, 3.0f, -2.0f}}) - .ToProto(); - xrt::XLAAllocation p1; - *p1.mutable_value() = xla::LiteralUtil::CreateR2({{1.0f, -1.0f, 2.5f, 1.17f}, - {1.2f, -1.6f, 2.8f, 1.24f}}) - .ToProto(); - xrt::XLAAllocation p2; - *p2.mutable_value() = CreateR0(2); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2, 4}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2, 4}).ToProto(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); - dyn_shape.set_dynamic_dimension(0, true); - dyn_shape.set_dynamic_dimension(1, true); - *shapes->mutable_result() = dyn_shape.ToProto(); - StoreComputationSnapshot(ReturnDynamicR2(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Scope cpu_root = root.WithDevice("/device:CPU:0"); - auto e_config = ops::Const(cpu_root, e.SerializeAsString()); - auto computation = ops::Const(cpu_root, c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); - auto p2_handle = ops::XRTAllocate(root, p2_value); - auto result = ops::XRTExecute( - root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); - auto expected = xla::LiteralUtil::CreateR2({{2.0f, 1.0f}, {2.7, 0.9}}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, DynamicR1TupleTest) { - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f, 1.0f}); - xrt::XLAAllocation p2; - *p2.mutable_value() = CreateR0(2); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - 
xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_shape.set_dynamic_dimension(0, true); - *shapes->mutable_result() = - xla::ShapeUtil::MakeTupleShape( - {dyn_shape, xla::ShapeUtil::MakeShape(xla::F32, {4}), dyn_shape}) - .ToProto(); - StoreComputationSnapshot(ReturnDynamicR1Tuple(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Scope cpu_root = root.WithDevice("/device:CPU:0"); - auto e_config = ops::Const(cpu_root, e.SerializeAsString()); - auto computation = ops::Const(cpu_root, c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); - auto p2_handle = ops::XRTAllocate(root, p2_value); - auto result = ops::XRTExecute( - root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); - - auto expected0 = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); - auto expected1 = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f, 0.0f}); - auto expected2 = xla::LiteralUtil::CreateR1({0.0f, 3.0f, 1.0f}); - auto expected = - xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, AcceptDynamicR1TupleTest) { - if (*xla_test_device_ptr == "XLA_CPU" || *xla_test_device_ptr == "XLA_GPU") { - // XLA_CPU and XLA_GPU has shape check set to kCompileTime. 
- return; - } - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); - - xrt::XLATupleNode tuple_desc; - auto subdesc_10 = tuple_desc.add_tuples(); - auto subdesc_11 = tuple_desc.add_tuples(); - subdesc_10->set_input_index(0); - subdesc_10->set_release_input_handle(true); - subdesc_11->set_input_index(1); - subdesc_11->set_release_input_handle(true); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_input_shape.set_dynamic_dimension(0, true); - xla::Shape dyn_tuple_shape = - xla::ShapeUtil::MakeTupleShape({dyn_input_shape, dyn_input_shape}); - *shapes->add_parameters() = dyn_tuple_shape.ToProto(); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_shape.set_dynamic_dimension(0, true); - *shapes->mutable_result() = dyn_shape.ToProto(); - StoreComputationSnapshot(AcceptDynamicR1Tuple(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Scope cpu_root = root.WithDevice("/device:CPU:0"); - auto e_config = ops::Const(cpu_root, e.SerializeAsString()); - auto computation = ops::Const(cpu_root, c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - - auto tuple_0 = ops::Const(root.WithDevice("/device:CPU:0"), - tuple_desc.SerializeAsString()); - auto t0_handle = ops::XRTMakeTuple( - root, tuple_0, - {static_cast(p0_handle), static_cast(p1_handle)}); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {static_cast(t0_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); - - auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, AcceptDynamicR1Test) { - if (*xla_test_device_ptr == "XLA_CPU" || *xla_test_device_ptr == "XLA_GPU") { - // XLA_CPU and XLA_GPU has shape check set to kCompileTime. 
- return; - } - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_input_shape.set_dynamic_dimension(0, true); - *shapes->add_parameters() = dyn_input_shape.ToProto(); - *shapes->add_parameters() = dyn_input_shape.ToProto(); - xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); - dyn_shape.set_dynamic_dimension(0, true); - *shapes->mutable_result() = dyn_shape.ToProto(); - StoreComputationSnapshot(AcceptDynamicR1(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Scope cpu_root = root.WithDevice("/device:CPU:0"); - auto e_config = ops::Const(cpu_root, e.SerializeAsString()); - auto computation = ops::Const(cpu_root, c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); - auto allocate_op_0 = ops::XRTAllocate(root, p0_value); - auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); - auto allocate_op_1 = ops::XRTAllocate(root, p1_value); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {Output(allocate_op_0), Output(allocate_op_1)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); - - auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, AcceptDynamicR2Test) { - xrt::XLAAllocation p0; - *p0.mutable_value() = - xla::LiteralUtil::CreateR2({{-1.0f, 2.0f, 3.0f}, {-4.0f, -5.0f, 6.0f}}) - .ToProto(); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - // Compile time expects ascending layout. 
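// In these dynamic-shape tests, set_dynamic_dimension(...) marks a dimension
// whose logical size is only known at run time; the static extent (4, or
// {2, 4}) acts as the padded upper bound, and reading the result back returns
// only the logical extent. In DynamicR1Test above, p0 + p1 would be
// {2.0, 1.0, 3.0, 0.17}, and with the runtime size parameter set to 2 the
// expected literal is just the first two elements, {2.0, 1.0}. That reading of
// ReturnDynamicR1() is inferred from the test's inputs and expectations; the
// helper itself is defined elsewhere in this file.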
- xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 4}); - dyn_shape.set_dynamic_dimension(1, true); - *shapes->add_parameters() = dyn_shape.ToProto(); - - *shapes->mutable_result() = dyn_shape.ToProto(); - StoreComputationSnapshot(AcceptDynamicR2(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Scope cpu_root = root.WithDevice("/device:CPU:0"); - auto e_config = ops::Const(cpu_root, e.SerializeAsString()); - auto computation = ops::Const(cpu_root, c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto result = - ops::XRTExecute(root, c_handle.handle, e_config, {Output(p0_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); - - auto expected = xla::LiteralUtil::CreateR2( - {{1.0f, -2.0f, -3.0f}, {4.0f, 5.0f, -6.0f}}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({8.0f, 5.0f}); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto packed_args = ops::Stack(root.WithDevice("/device:CPU:0"), - {Output(p0_handle), Output(p1_handle)}); - auto result = - ops::XRTExecute(root, c_handle.handle, e_config, {Output(packed_args)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - - xla::ProgramShapeProto program_shape; - EXPECT_TRUE(ParseFromTString(outputs[1].vec()(0), &program_shape)); - 
EXPECT_EQ(program_shape.parameters_size(), 2); -} - -TEST(RawApiTest, CompileWithXlaReturnShapes) { - xla::XlaBuilder builder("XrtXlaShapes"); - auto input_shape = xla::ShapeUtil::MakeShape(xla::BF16, {32, 3, 128, 128}); - auto kernel_shape = xla::ShapeUtil::MakeShape(xla::BF16, {3, 3, 5, 5}); - // Clear layouts to signal XLA we are ready to get whatever are coming out of - // the compilation process. - xla::LayoutUtil::ClearLayout(&input_shape); - xla::LayoutUtil::ClearLayout(&kernel_shape); - auto param_shape = - xla::ShapeUtil::MakeTupleShape({input_shape, kernel_shape}); - auto param = xla::Parameter(&builder, 0, param_shape, "param"); - auto input = xla::GetTupleElement(param, 0); - auto kernel = xla::GetTupleElement(param, 1); - xla::Conv(input, kernel, {1, 1}, xla::Padding::kSame); - TF_ASSERT_OK_AND_ASSIGN(xla::XlaComputation xla_computation, builder.Build()); - - auto result_shape = xla_computation.GetProgramShape().value().result(); - // Clear the result shape layout to tell XLA we are accepting whatever are - // coming out of the compilation process. - xla::LayoutUtil::ClearLayout(&result_shape); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = param_shape.ToProto(); - *shapes->mutable_result() = result_shape.ToProto(); - StoreComputationSnapshot(xla_computation, c.mutable_hlo_snapshot()); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto release = ops::XRTReleaseCompilationHandle(root, c_handle.handle); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {c_handle.program_shape}, - {release}, &outputs)); - - xla::ProgramShapeProto program_shape_proto; - EXPECT_TRUE( - ParseFromTString(outputs[0].vec()(0), &program_shape_proto)); - xla::ProgramShape program_shape(program_shape_proto); - EXPECT_EQ(program_shape.parameters_size(), 1); - - VLOG(2) << "Param: " - << xla::ShapeUtil::HumanStringWithLayout(program_shape.parameters(0)); - VLOG(2) << "Result: " - << xla::ShapeUtil::HumanStringWithLayout(program_shape.result()); - - xla::ProgramShape xla_program_shape = - XlaCompiledProgramShape(xla_computation, xla::ProgramShape(*shapes)); - EXPECT_TRUE(xla::Layout::Equal().MinorToMajorOnly()( - xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {0}).layout(), - xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {0}) - .layout())); - EXPECT_TRUE(xla::Layout::Equal().MinorToMajorOnly()( - xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {1}).layout(), - xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {1}) - .layout())); - EXPECT_TRUE(xla::Layout::Equal().MinorToMajorOnly()( - program_shape.result().layout(), xla_program_shape.result().layout())); -} - -TEST(RawApiTest, DotGeneralWithLayoutTest) { - auto layout = xla::LayoutUtil::MakeLayout({0, 1}); - - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatMatrix({{1.0f, 2.0f}, {3.0f, 4.0f}}, layout); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatMatrix({{8.0f}, {5.0f}}, layout); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShapeWithDenseLayout(xla::F32, {2, 2}, {0, 1}) - .ToProto(); - *shapes->add_parameters() = - 
xla::ShapeUtil::MakeShapeWithDenseLayout(xla::F32, {2, 1}, {0, 1}) - .ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeShapeWithDenseLayout(xla::F32, {2, 1}, {0, 1}) - .ToProto(); - StoreComputationSnapshot(Dot(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = - xla::LiteralUtil::CreateR2WithLayout({{18.0f}, {44.0f}}, layout); - - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, CompileAndExecuteZeroArg) { - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - StoreComputationSnapshot(OnePlusTwo(), c.mutable_hlo_snapshot()); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - std::initializer_list({})); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR0(3.0f); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, CompileAndExecuteReturnTuple) { - xrt::XLAAllocation p0; - *p0.mutable_value() = FloatVector({1.0f, 2.0f}); - xrt::XLAAllocation p1; - *p1.mutable_value() = FloatVector({8.0f, 5.0f}); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})}) - .ToProto(); - StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - 
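// On the XRTExecutionConfig flags exercised here and in the surrounding tests:
// release_input_handles frees the argument allocations once the execution has
// consumed them, and release_compilation_handle frees the compiled executable
// after this single run (ExecuteChainedOpByOp sets both to false precisely so
// it can keep reusing its handles). return_exploded_tuple, used in the
// following test, makes XRTExecute return one handle per top-level tuple
// element instead of a single tuple handle. The first two readings follow from
// how the tests reuse or drop handles; the last is visible directly in the
// exploded-tuple test's output vector.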
- Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto sum = xla::LiteralUtil::CreateR1({9.0f, 7.0f}); - auto expected = xla::LiteralUtil::MakeTuple({&sum}); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); -} - -TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) { - xrt::XLAAllocation p0; - *p0.mutable_value() = xla::LiteralUtil::CreateR0(12.0f).ToProto(); - - xrt::XLAAllocation p1; - *p1.mutable_value() = xla::LiteralUtil::CreateR0(3.0f).ToProto(); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}), - xla::ShapeUtil::MakeShape(xla::F32, {})}) - .ToProto(); - StoreComputationSnapshot(AddAndSubTuple(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - e.set_return_exploded_tuple(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle)}); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({result}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - auto handles_vec = outputs.front().vec(); - EXPECT_EQ(handles_vec.size(), 2); - - const float kResults[2] = {15.0f, 9.0f}; - for (int64_t i = 0; i < handles_vec.size(); ++i) { - auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(i))); - std::vector voutputs; - TF_EXPECT_OK(session.Run({read_back}, &voutputs)); - EXPECT_EQ(voutputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(voutputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR0(kResults[i]); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - } 
-} - -TEST(RawApiTest, LeakCompilationReference) { - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->add_parameters() = - xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})}) - .ToProto(); - StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot()); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({c_handle.handle}, &outputs)); -} - -TEST(RawApiTest, CompileAndExecuteWithReusedBuffers) { - xla::Shape element_shape = xla::ShapeUtil::MakeShape(xla::F32, {2}); - xla::Shape shape = - xla::ShapeUtil::MakeTupleShape({element_shape, element_shape}); - xla::Shape return_shape = xla::ShapeUtil::MakeTupleShape( - {element_shape, element_shape, element_shape, element_shape}); - xla::XlaBuilder builder("ReuseBuffer"); - auto param = xla::Parameter(&builder, 0, shape, "param"); - auto p0 = xla::GetTupleElement(param, 0); - auto p1 = xla::GetTupleElement(param, 1); - auto add = xla::Add(p0, p1); - auto sub = xla::Sub(p0, p1); - xla::Tuple(&builder, {add, sub, p0, p1}); - - // Flip the tuple literals in the input handle. - builder.SetUpAlias({1}, 0, {0}); - builder.SetUpAlias({0}, 0, {1}); - - auto computation = builder.Build().value(); - - auto literal0 = xla::LiteralUtil::CreateR1({1.0f, 2.0f}); - auto literal1 = xla::LiteralUtil::CreateR1({5.0f, 9.0f}); - auto literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1}); - - xrt::XLAAllocation param_alloc; - *param_alloc.mutable_value() = literal.ToProto(); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = shape.ToProto(); - *shapes->mutable_result() = return_shape.ToProto(); - StoreComputationSnapshot(computation, c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(false); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - XrtClientSession session(root); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto c_data = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, c_data); - auto param_value = ops::Const(root.WithDevice("/device:CPU:0"), - param_alloc.SerializeAsString()); - auto param_handle = ops::XRTAllocate(root, param_value); - TF_ASSERT_OK(root.status()); - - std::vector outputs; - TF_EXPECT_OK(session.Run({param_handle}, &outputs)); - - int64_t alloc_handle = outputs[0].scalar()(); - - // Note that we release the result handle immediately, but since we aliased - // the output buffers onto the input allocation ones (held in alloc_handle), - // we can fetch the result from there. 
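// The aliasing configured in the builder above is, spelled out,
// SetUpAlias(/*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}) and
// SetUpAlias(/*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1}):
// output element 1 (the sub result) reuses the buffer of input element 0, and
// output element 0 (the add result) reuses the buffer of input element 1. With
// inputs {1, 2} and {5, 9}, the add is {6, 11} and the sub is {-4, -7}, so the
// donated input handle should read back as ({-4, -7}, {6, 11}), which is what
// the expected_literal assembled at the end of this test verifies.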
- auto result = - ops::XRTExecute(root, c_handle.handle, e_config, {Input(alloc_handle)}); - auto read_back = ops::XRTReadLiteral(root, result); - auto release = ops::XRTReleaseAllocationHandle( - root.WithControlDependencies(read_back), result); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK( - session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); - - xla::Literal exec_literal = ReadOutputLiteral(outputs, 0); - auto exec_literal_parts = exec_literal.DecomposeTuple(); - ASSERT_EQ(exec_literal_parts.size(), 4); - - EXPECT_TRUE(CompareLiterals(exec_literal_parts[2], literal0)); - EXPECT_TRUE(CompareLiterals(exec_literal_parts[3], literal1)); - - // Now we read back the original input handle values, which at this point - // should contain the result of the XLA computation. - auto read_handle = ops::XRTReadLiteral(root, Input(alloc_handle)); - TF_ASSERT_OK(root.status()); - auto release_handle = ops::XRTReleaseAllocationHandle( - root.WithControlDependencies(read_handle), Input(alloc_handle)); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {read_handle}, - {release_handle}, &outputs)); - - xla::Literal return_literal = ReadOutputLiteral(outputs, 0); - - auto expected_literal0 = xla::LiteralUtil::CreateR1({6.0f, 11.0f}); - auto expected_literal1 = xla::LiteralUtil::CreateR1({-4.0f, -7.0f}); - // The first element of the computation returned tuple would be the add - // (expected_literal0), but since we flipped the buffers, the sub - // (expected_literal1) should come first. - auto expected_literal = - xla::LiteralUtil::MakeTuple({&expected_literal1, &expected_literal0}); - - EXPECT_TRUE(CompareLiterals(return_literal, expected_literal)); -} - -TEST(RawApiTest, CompileAndExecuteWithReusedBuffersS64) { - xla::Shape element_shape = xla::ShapeUtil::MakeShape(xla::S64, {2}); - xla::Shape shape = - xla::ShapeUtil::MakeTupleShape({element_shape, element_shape}); - xla::Shape return_shape = xla::ShapeUtil::MakeTupleShape( - {element_shape, element_shape, element_shape, element_shape}); - xla::XlaBuilder builder("ReuseBuffer"); - auto param = xla::Parameter(&builder, 0, shape, "param"); - auto p0 = xla::GetTupleElement(param, 0); - auto p1 = xla::GetTupleElement(param, 1); - auto add = xla::Add(p0, p1); - auto sub = xla::Sub(p0, p1); - xla::Tuple(&builder, {add, sub, p0, p1}); - - // Flip the tuple literals in the input handle. 
- builder.SetUpAlias({1}, 0, {0}); - builder.SetUpAlias({0}, 0, {1}); - - auto computation = builder.Build().value(); - - auto literal0 = xla::LiteralUtil::CreateR1({1, 2}); - auto literal1 = xla::LiteralUtil::CreateR1({5, 9}); - auto literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1}); - - xrt::XLAAllocation param_alloc; - *param_alloc.mutable_value() = literal.ToProto(); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = shape.ToProto(); - *shapes->mutable_result() = return_shape.ToProto(); - StoreComputationSnapshot(computation, c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(false); - e.set_release_compilation_handle(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - XrtClientSession session(root); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto c_data = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, c_data); - auto param_value = ops::Const(root.WithDevice("/device:CPU:0"), - param_alloc.SerializeAsString()); - auto param_handle = ops::XRTAllocate(root, param_value); - TF_ASSERT_OK(root.status()); - - std::vector outputs; - TF_EXPECT_OK(session.Run({param_handle}, &outputs)); - - int64_t alloc_handle = outputs[0].scalar()(); - - // Note that we release the result handle immediately, but since we aliased - // the output buffers onto the input allocation ones (held in alloc_handle), - // we can fetch the result from there. - auto result = - ops::XRTExecute(root, c_handle.handle, e_config, {Input(alloc_handle)}); - auto read_back = ops::XRTReadLiteral(root, result); - auto release = ops::XRTReleaseAllocationHandle( - root.WithControlDependencies(read_back), result); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK( - session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); - - xla::Literal exec_literal = ReadOutputLiteral(outputs, 0); - auto exec_literal_parts = exec_literal.DecomposeTuple(); - ASSERT_EQ(exec_literal_parts.size(), 4); - - EXPECT_TRUE(CompareLiterals(exec_literal_parts[2], literal0)); - EXPECT_TRUE(CompareLiterals(exec_literal_parts[3], literal1)); - - // Now we read back the original input handle values, which at this point - // should contain the result of the XLA computation. - auto read_handle = ops::XRTReadLiteral(root, Input(alloc_handle)); - TF_ASSERT_OK(root.status()); - auto release_handle = ops::XRTReleaseAllocationHandle( - root.WithControlDependencies(read_handle), Input(alloc_handle)); - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {read_handle}, - {release_handle}, &outputs)); - - xla::Literal return_literal = ReadOutputLiteral(outputs, 0); - - auto expected_literal0 = xla::LiteralUtil::CreateR1({6, 11}); - auto expected_literal1 = xla::LiteralUtil::CreateR1({-4, -7}); - // The first element of the computation returned tuple would be the add - // (expected_literal0), but since we flipped the buffers, the sub - // (expected_literal1) should come first. 
- auto expected_literal = - xla::LiteralUtil::MakeTuple({&expected_literal1, &expected_literal0}); - - EXPECT_TRUE(CompareLiterals(return_literal, expected_literal)); -} - -TEST(RawApiTest, CompileAndExecuteWithS64Argument) { - xrt::XLAAllocation p0; - *p0.mutable_value() = xla::LiteralUtil::CreateR0(11031965).ToProto(); - xrt::XLAAllocation p1; - *p1.mutable_value() = xla::LiteralUtil::CreateR0(4091934).ToProto(); - - xrt::XLAComputation c; - auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto(); - *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto(); - *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto(); - StoreComputationSnapshot(AddS64(), c.mutable_hlo_snapshot()); - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(true); - e.set_release_compilation_handle(true); - e.set_return_exploded_tuple(true); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto computation = - ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString()); - auto c_handle = ops::XRTCompile(root, computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - auto p1_value = - ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString()); - auto p1_handle = ops::XRTAllocate(root, p1_value); - auto result = ops::XRTExecute(root, c_handle.handle, e_config, - {Output(p0_handle), Output(p1_handle)}); - auto read_back = ops::XRTReadLiteralAndRelease(root, result); - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - - auto expected = xla::LiteralUtil::CreateR0(15123899); - EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - - xla::ProgramShapeProto program_shape; - EXPECT_TRUE(ParseFromTString(outputs[1].vec()(0), &program_shape)); - EXPECT_EQ(program_shape.parameters_size(), 2); - EXPECT_TRUE(xla::ShapeUtil::HasPrimitiveType( - xla::Shape(program_shape.result()), xla::S64)); -} - -// Tests the XRT device memory compaction API (XRTCompactAllocations). -TEST(RawApiTest, TestDeviceMemoryCompaction) { - static const int kNumAllocs = 32; - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - - std::vector allocs(kNumAllocs); - std::vector handle_outputs; - for (int i = 0; i < kNumAllocs; ++i) { - *allocs[i].mutable_value() = BasedTwoElementTuple(i * 4.0f); - auto value = ops::Const(root.WithDevice("/device:CPU:0"), - allocs[i].SerializeAsString()); - handle_outputs.push_back(ops::XRTAllocate(root, value)); - } - TF_ASSERT_OK(root.status()); - - XrtClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run(handle_outputs, &outputs)); - EXPECT_EQ(outputs.size(), handle_outputs.size()); - - std::vector handles; - for (auto& output : outputs) { - handles.push_back(output.scalar()()); - } - // Create holes by releasing even allocations. 
- std::vector handle_releases; - for (size_t i = 0; i < handles.size(); i += 2) { - handle_releases.push_back( - ops::XRTReleaseAllocationHandle(root, Input(handles[i]))); - } - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK( - session.Run(ClientSession::FeedType(), {}, handle_releases, &outputs)); - - // Run the compaction API. - auto compact_op = ops::XRTCompactAllocations(root); - TF_EXPECT_OK( - session.Run(ClientSession::FeedType(), {}, {compact_op}, &outputs)); - - // Read back the allocation left at odd indices. - std::vector read_outputs; - for (size_t i = 1; i < handles.size(); i += 2) { - read_outputs.push_back(ops::XRTReadLiteral(root, Input(handles[i]))); - } - TF_ASSERT_OK(root.status()); - - TF_EXPECT_OK(session.Run(read_outputs, &outputs)); - EXPECT_EQ(outputs.size(), read_outputs.size()); - - // Verify that everything got moved correctly and the device data matches what - // we have on record. - for (size_t i = 1, j = 0; i < handles.size(); i += 2, ++j) { - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[j].scalar()(), &response)); - EXPECT_TRUE(CompareLiteralProtos(allocs[i].value(), response)); - } -} - -TEST(RawApiTest, TestDeviceMemorySwap) { - const xla::Shape scalar_shape = xla::ShapeUtil::MakeShape(xla::F32, {}); - // 100MB F32 tensor. - const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {5000, 5000}); - const int64_t tensor_size = xla::ShapeUtil::ByteSizeOf(shape); - // On CPU we cannot trigger OOM/swap. For TPU and GPU we select 16GB as - // maximum memory. - int64_t device_memory_size = 8LL * 1024 * 1024 * 1024; - if (*xla_test_device_ptr == "TPU" || *xla_test_device_ptr == "XLA_GPU") { - device_memory_size = 16LL * 1024 * 1024 * 1024; - } - - xrt::XLAAllocation p0; - *p0.mutable_value() = xla::LiteralUtil::CreateR0(0.90434).ToProto(); - - // Create a computation which broadcasts a scalar to a big tensor. - xrt::XLAComputation c_bcast; - { - auto shapes = c_bcast.mutable_config()->mutable_program_shape(); - *shapes->add_parameters() = scalar_shape.ToProto(); - *shapes->mutable_result() = shape.ToProto(); - StoreComputationSnapshot( - BroadcastComputation(scalar_shape, shape.dimensions()), - c_bcast.mutable_hlo_snapshot()); - } - - // Create a computation which compares two tensors. 
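The IsEqualComputation helper used below is defined earlier in the deleted test file, outside this excerpt. As a rough, hypothetical sketch of a computation with the same signature (two tensors of shape in, one s32[] out that is 0 exactly when the inputs match), it could be built along these lines:

  xla::XlaBuilder b("is_equal_sketch");
  auto p0 = xla::Parameter(&b, 0, shape, "p0");
  auto p1 = xla::Parameter(&b, 1, shape, "p1");
  // Count the mismatching elements; the reduction is 0 iff the tensors match.
  auto mismatches = xla::ConvertElementType(xla::Ne(p0, p1), xla::S32);
  xla::ReduceAll(mismatches, xla::Zero(&b, xla::S32),
                 xla::CreateScalarAddComputation(xla::S32, &b));
  auto is_equal = b.Build().value();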
- xrt::XLAComputation c_equal; - { - auto shapes = c_equal.mutable_config()->mutable_program_shape(); - *shapes->add_parameters() = shape.ToProto(); - *shapes->add_parameters() = shape.ToProto(); - *shapes->mutable_result() = - xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); - StoreComputationSnapshot(IsEqualComputation(shape), - c_equal.mutable_hlo_snapshot()); - } - - xrt::XRTExecutionConfig e; - e.set_release_input_handles(false); - e.set_release_compilation_handle(false); - - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - XrtClientSession session(root); - auto e_config = - ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); - auto bcast_computation = - ops::Const(root.WithDevice("/device:CPU:0"), c_bcast.SerializeAsString()); - auto c_bcast_handle = ops::XRTCompile(root, bcast_computation); - auto equal_computation = - ops::Const(root.WithDevice("/device:CPU:0"), c_equal.SerializeAsString()); - auto c_equal_handle = ops::XRTCompile(root, equal_computation); - auto p0_value = - ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); - auto p0_handle = ops::XRTAllocate(root, p0_value); - std::vector<Tensor> outputs; - std::vector<int64_t> device_handles; - - // Create more data than the device can take using the broadcast computation. - int64_t num_tensors = 8 + device_memory_size / tensor_size; - for (int64_t i = 0; i < num_tensors; ++i) { - auto result = ops::XRTExecute(root, c_bcast_handle.handle, e_config, - {Output(p0_handle)}); - TF_ASSERT_OK(root.status()); - TF_ASSERT_OK(session.Run({result}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - device_handles.push_back(outputs[0].scalar<int64_t>()()); - } - - // Trigger computations on XRT handles to verify the swap-out/swap-in logic, - // by comparing sequential pairs of tensors.
- auto zero_literal = xla::LiteralUtil::CreateR0(0); - for (size_t i = 0; i + 1 < device_handles.size(); ++i) { - auto exec_op = ops::XRTExecute( - root, c_equal_handle.handle, e_config, - {Input(device_handles[i]), Input(device_handles[i + 1])}); - auto read_back = ops::XRTReadLiteral(root, exec_op); - - TF_ASSERT_OK(root.status()); - TF_ASSERT_OK(session.Run({read_back}, &outputs)); - EXPECT_EQ(outputs.size(), 1); - - xla::LiteralProto response; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &response)); - auto literal = xla::Literal::CreateFromProto(response).value(); - EXPECT_EQ(literal, zero_literal); - } -} - -TEST(RawApiTest, TestMetricsFetch) { - xrt::XRTMetricsCollect metrics; - metrics.add_metrics_regex("/tensorflow/xrt/.*"); - - Scope root = Scope::NewRootScope().WithDevice("/device:CPU:0"); - auto metrics_value = ops::Const(root, metrics.SerializeAsString()); - Output result = ops::XRTMetricsCollect(root, metrics_value); - TF_ASSERT_OK(root.status()); - - ClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({result}, &outputs)); - ASSERT_EQ(outputs.size(), 1); - - xrt::MetricsReport report; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &report)); - for (auto& metric : report.metrics()) { - EXPECT_EQ(metric.name().compare(0, 16, "/tensorflow/xrt/"), 0); - } -} - -TEST(RawApiTest, TestMemoryInfo) { - Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - Output result = ops::XRTMemoryInfo(root); - TF_ASSERT_OK(root.status()); - - ClientSession session(root); - std::vector outputs; - TF_EXPECT_OK(session.Run({result}, &outputs)); - ASSERT_EQ(outputs.size(), 1); - - xrt::MemoryInfo mem_info; - EXPECT_TRUE(ParseFromTString(outputs[0].scalar()(), &mem_info)); - EXPECT_GT(mem_info.kb_total(), 0); - EXPECT_GT(mem_info.kb_free(), 0); -} - -} // namespace - -} // namespace tensorflow - -int main(int argc, char** argv) { - tensorflow::xla_test_device_ptr = new tensorflow::string("XLA_CPU"); - tensorflow::xla_platform_ptr = new tensorflow::string("CPU"); - std::vector flag_list = { - tensorflow::Flag("xla_test_device", tensorflow::xla_test_device_ptr, - "Tensorflow device type to use for test, e.g., XLA_CPU"), - tensorflow::Flag("xla_platform", tensorflow::xla_platform_ptr, - "The XLA platform to select for the device"), - }; - tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list); - const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list); - if (!parse_result) { - LOG(ERROR) << "\n" << usage; - return 2; - } - testing::InitGoogleTest(&argc, argv); - if (argc > 1) { - LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; - return 2; - } - return RUN_ALL_TESTS(); -} diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto deleted file mode 100644 index 826ecafc8a9273..00000000000000 --- a/tensorflow/compiler/xrt/xrt.proto +++ /dev/null @@ -1,277 +0,0 @@ -syntax = "proto3"; - -package xrt; - -import "tensorflow/compiler/tf2xla/host_compute_metadata.proto"; -import "xla/service/hlo.proto"; -import "xla/xla.proto"; -import "xla/xla_data.proto"; - -message DeviceAssignment { - message ComputationDevice { - message DeviceMeshCoordinates { - // The mesh coordinates for the device. Usually (X, Y, Z, Core), in the - // order in which they are returned in the TopologyProto. - // X = value(0) - // Y = value(1) - // Z = value(2) - // Core = value(3) - repeated int32 value = 1; - } - // As many replicas as there are in the replicated computation. 
- repeated DeviceMeshCoordinates replica_devices = 1; - } - // As many ComputationDevice as many there are computations (number - // of cores per replica). - repeated ComputationDevice computation_devices = 1; -} - -// Options for an XLA compilation. -message XLAComputationConfig { - // The number of replicas the computation will be run on. If this is - // default (0) it is interpreted as 1. - int32 num_replicas = 1; - // The number of "model-parallel" cores per replica. If this is - // default (0) it is interpreted as 1. - int32 num_cores_per_replica = 2; - // Optional metadata about host sends and recvs. - tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3; - - // The arg/result shapes for the whole computation. - xla.ProgramShapeProto program_shape = 4; - // The arg/result shapes for each core of a model-parallel - // computation. per_core_args_and_result_shapes is optional for a - // single-core computation. - repeated xla.ProgramShapeProto per_core_program_shape = 5; - // Describes how replicated computation instances should be assigned to - // devices. There are num_cores_per_replica computations, and each one will be - // sent and executed to the set of replica device numbers described in the - // DeviceAssignment proto. - DeviceAssignment device_assignment = 6; - // The debugging options to be passed to the XLA compilation process. - xla.DebugOptions debug_options = 7; - - // Everything inside Experimental is subject to change and is not subject - // to API stability guarantees in - // https://www.tensorflow.org/guide/version_compat. - message Experimental { - message UpdateIndexPair { - int32 index = 1; - bool updated = 2; - } - - // stateful_input_indices is only useful when using XRT-compiled - // programs together with standard TensorFlow TPU execution ops, so should - // be ignored by most clients. - // - // Optionally the client can pass information about which inputs - // to the computation are updates to "stateful" quantities. Each - // element of stateful_input_indices includes an index indicating - // which input argument it corresponds to, and a bool indicating - // whether the value is updated or not. If the XRT computation is - // going to be used with a TensorFlow TPU execution op then an - // input index must be present for each input that will correspond - // to a resource variable in the execution op, and may not be - // present for any other input. - repeated UpdateIndexPair stateful_input_indices = 1; - } - - Experimental experimental = 8; -} - -// Options and XLA computation for a compilation. -message XLAComputation { - XLAComputationConfig config = 1; - xla.HloSnapshot hlo_snapshot = 2; -} - -// Literal to allocate space for, and transfer to, device memory. -message XLAAllocation { - reserved 1; - xla.LiteralProto value = 2; -} - -// Node in a tree describing a tuple constructed from input handles. A -// node is an internal node if tuples is non-empty, in which case -// input_index and release_input_handle are ignored. Otherwise a node -// is a leaf node. Each leaf XLATupleNode is the index of an input -// which corresponds to a handle that will be grafted onto the output -// tuple at that location. If release_input_handle is true that input -// handle will be released and become invalid. Inputs may be repeated -// in which case leaves of the output tuple will alias. If an input is -// repeated, release_input_handle must be false for every leaf where -// that input appears. 
-// -// For example, if input 0 has shape {} and input 1 has shape {2,3} -// then the XLATupleNode with structure {1,{0,1}} corresponds to a -// tuple with shape {{2,3},{{},{2,3}}}. -message XLATupleNode { - int32 input_index = 1; - bool release_input_handle = 2; - repeated XLATupleNode tuples = 3; -} - -message CommonExecutionConfig { - // The replica index this execute is driving. - int32 replica_id = 1; - // Mapping local device ordinals to global replica IDs. - // local_replica_mapping[LOCAL_DEVICE_ORDINAL] = GLOBAL_REPLICA_ID - repeated int32 local_replica_mapping = 2; - // The execution run ID used to correlate different XRT execute operations - // happeining in parallel from different threads. - int64 run_id = 3; -} - -// Options for an XLA execution. -message XRTExecutionConfig { - // Local device to run on. This is present because the execute Op - // may be placed on a device such as CPU or TPU_SYSTEM that - // logically manages multiple cores. - int32 device_ordinal = 1; - // Which model-parallel computation to run from the compiled bundle. - int32 core_index_in_replica = 2; - // Optional key to disambiguate between executions. This is only - // needed if multiple host send/recvs may be outstanding - // concurrently with executions. - string execution_instance_key = 3; - // If non-zero, rng_seed to reset the core with. - uint32 rng_seed = 4; - // If true, release allocation handles on the inputs after running. - bool release_input_handles = 5; - // If true, release the handle to the computation after running. - bool release_compilation_handle = 6; - // If set to true, and the result shape is a tuple, then instead of returning - // a single tuple allocation the execution will return a vector of - // allocations, one for each of the first-level elements of the result tuple. - bool return_exploded_tuple = 7; - reserved 8; - // The common configuration for XRT execute operations. - CommonExecutionConfig common_config = 9; -} - -message XRTChainedExecuteConfig { - // If non-zero, rng_seed to reset the core with. - uint32 rng_seed = 1; - // Which model-parallel computation to run from the compiled bundle. - int32 core_index_in_replica = 2; - // Optional key to disambiguate between executions. This is only needed if - // multiple host send/recvs may be outstanding concurrently with executions. - string execution_instance_key = 3; - reserved 4; - // The common configuration for XRT execute operations. - CommonExecutionConfig common_config = 5; -} - -// A single chained execute operation. An operation can either be a device data -// load, or an existing (as in, previously compiled and accessible via its int64 -// handle) XLA computation execution. -message XRTChainedExecuteOp { - // Represents an input for this operation. - message Input { - // The index within the XRTChainedExecutePlan.ops post-order of the source - // operation for this input. - int64 op_index = 1; - // The output index of the value generated by the operation at op_index. - // Zero (default value) means no index ({}) while if an indexing is - // required, output_index needs to be set to index+1. - // Thanks proto3! - int64 output_index = 2; - } - // Represents an output of the XRTChainedExecute operation, which should - // originate by the output of this operation. - message Output { - // The index in the value generated by this operation, which should be - // forwarded as XRTChainedExecute output. If output_index is zero (default - // value) the whole output will be used as result. 
This means that if the - // output shape is a tuple, the result will be the full tuple. Otherwise the - // real sub-tuple index will be output_index - 1. - int64 output_index = 1; - // The index in the vector of the results returned by the XRTChainedExecute - // operation, where this output should be forwarded. - int64 result_index = 2; - } - - oneof op_oneof { - // The handle to an existing XRT device data. - int64 data_handle = 1; - // The handle to an existing XRT compiled computation. - int64 computation_handle = 2; - } - // The outputs of this XRTChainedExecuteOp operation. - repeated Output outputs = 3; - // The inputs of this XRTChainedExecuteOp operation. If data_handle is set, - // there are no inputs. - repeated Input inputs = 4; -} - -// Execution plan for the XRTChainedExecute operation. -message XRTChainedExecutePlan { - // The post order with the XRT computations to be executed. - repeated XRTChainedExecuteOp ops = 1; -} - -// The message used to encode the options for the XRTMetricsCollect operation. -message XRTMetricsCollect { - // A list of regular expressions to match the metric names. Empty means to - // return all the metrics reported by the collection registry. - repeated string metrics_regex = 1; -} - -message Percentiles { - message Point { - // In the [0, 100] range. - double percentile = 1; - double value = 2; - } - - // The time (in nanoseconds) of the first sample within the samples buffer. - uint64 start_nstime = 1; - // The time (in nanoseconds) of the last sample within the samples buffer. - uint64 end_nstime = 2; - // The minimum value of the samples within the samples buffer. - double min_value = 3; - // The maximum value of the samples within the samples buffer. - double max_value = 4; - // The mean value of the samples within the samples buffer. - double mean = 5; - // The stndard deviation of the samples within the samples buffer. - double stddev = 6; - // The number samples within the samples buffer. - uint64 num_samples = 7; - // The total number of times this metrics has been posted a value to. - uint64 total_samples = 8; - // The sum of all the posted values. - double accumulator = 9; - // The percentile points reported by the metric. - repeated Point points = 10; -} - -message MetricValues { - enum UnitOfMeasure { - INVALID = 0; - NUMBER = 1; - TIME = 2; - BYTES = 3; - } - - // The metric name. - string name = 1; - - oneof values_oneof { - Percentiles percentiles_value = 2; - int64 int64_value = 3; - } - - UnitOfMeasure unit_of_measure = 4; -} - -message MetricsReport { - repeated MetricValues metrics = 1; -} - -message MemoryInfo { - // The total memory on a device, in KB. - int64 kb_total = 1; - // The free memory on a device, in KB. - int64 kb_free = 2; -} diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.cc b/tensorflow/compiler/xrt/xrt_compilation_cache.cc deleted file mode 100644 index 7c88bad0b22bff..00000000000000 --- a/tensorflow/compiler/xrt/xrt_compilation_cache.cc +++ /dev/null @@ -1,307 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
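To make the indexing conventions of the XRTChainedExecuteOp and XRTChainedExecutePlan messages above concrete, here is a hypothetical sketch of a two-op plan using the generated C++ proto API (allocation_handle and computation_handle are illustrative values, such as would be obtained from XRTAllocate and XRTCompile):

  xrt::XRTChainedExecutePlan plan;
  // Op 0 (post-order index 0): an existing device allocation.
  auto* op0 = plan.add_ops();
  op0->set_data_handle(allocation_handle);
  // Op 1: run an already compiled computation on the result of op 0.
  auto* op1 = plan.add_ops();
  op1->set_computation_handle(computation_handle);
  auto* in = op1->add_inputs();
  in->set_op_index(0);      // consume op 0's value
  in->set_output_index(0);  // 0 = whole output; i = sub-tuple index i - 1
  // Forward op 1's whole result as result 0 of the XRTChainedExecute op.
  auto* out = op1->add_outputs();
  out->set_output_index(0);
  out->set_result_index(0);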
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" - -#include - -#include -#include -#include -#include - -#include "absl/synchronization/mutex.h" -#include "xla/client/local_client.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/random/random.h" - -namespace tensorflow { - -namespace { - -int64_t get_uid() { - uint64 unsigned_rand = random::New64() & INT64_MAX; - return static_cast(unsigned_rand); -} - -int64_t GetCompilationCacheSizeFromEnv() { - const char* env = getenv("TF_XRT_COMPILATION_CACHE_SIZE"); - return env == nullptr ? 1024 : std::stol(env); -} - -} // namespace - -const char* kXRTCompilationCacheResourceName = "xrt_compilation_cache"; - -XRTCompilationCache::EntryRefImpl::EntryRefImpl(XRTCompilationCache* parent, - CompiledSubgraph* entry) - : parent_(parent), entry_(entry) { - entry_->Ref(); -} - -XRTCompilationCache::EntryRefImpl::~EntryRefImpl() { - parent_->DiscardEntryRef(entry_); -} - -XRTCompilationCacheEntry XRTCompilationCache::EntryRefImpl::get() { - return XRTCompilationCacheEntry(entry_->program.get()); -} - -XRTCompilationCache::XRTCompilationCache(int max_number_of_entries) - : max_cache_entries_(max_number_of_entries) { - CHECK_GE(max_cache_entries_, 0); - VLOG(1) << "Created compilation cache max " << max_cache_entries_ - << " entries."; -} - -XRTCompilationCache::~XRTCompilationCache() { - VLOG(1) << "XRTCompilationCache::~XRTCompilationCache()"; - // A buggy client may be holding onto a reference, or a client might have - // crashed while holding onto a reference. In either case, discard all - // outstanding client references to avoid leaking storage. - for (const auto& entry : entries_by_uid_) { - while (!entry.second->RefCountIsOne()) { - entry.second->Unref(); - } - } - while (!entries_by_last_use_.empty()) { - MarkOldestEntryForEviction(); - } - CHECK_EQ(cache_.size(), 0); - CHECK_EQ(entries_by_uid_.size(), 0); - CHECK_EQ(cache_entries_, 0); - CHECK_EQ(marked_for_eviction_entries_, 0); -} - -Status XRTCompilationCache::Release(int64_t uid) { - absl::MutexLock lock(&mu_); - auto iter = entries_by_uid_.find(uid); - - if (iter == entries_by_uid_.end()) { - return errors::NotFound("No cache entry found for uid ", uid); - } - - DiscardEntryRefLocked(iter->second); - - VLOG(1) << "After releasing entry " << uid << " refs cache is " - << cache_.size() << " entries (" - << cache_entries_ + marked_for_eviction_entries_ - << "), marked for eviction " - << (cache_.size() - entries_by_last_use_.size()) << " entries (" - << marked_for_eviction_entries_ << ")."; - - return OkStatus(); -} - -void XRTCompilationCache::DiscardEntryRef(CompiledSubgraph* entry) { - absl::MutexLock lock(&mu_); - DiscardEntryRefLocked(entry); -} - -void XRTCompilationCache::DiscardEntryRefLocked(CompiledSubgraph* entry) { - if (entry->RefCountIsOne()) { - // The last reference to this entry is going away, so really delete it from - // the cache in such a way that it can't be restored by being looked up - // again. - - // Sanity-check that it has been marked for eviction. - CHECK(entries_by_last_use_.find(entry->last_use) == - entries_by_last_use_.end()); - // Update the counter tracking how much space is taken up by entries that - // are marked for eviction. - --marked_for_eviction_entries_; - - // Remove the entry from the cache. 
- auto erased = cache_.erase(entry->key); - if (erased == 0) { - LOG(FATAL) << "Tried to discard nonexistent cache entry"; - } - erased = entries_by_uid_.erase(entry->uid); - CHECK_EQ(erased, 1); - } - entry->Unref(); -} - -void XRTCompilationCache::MarkOldestEntryForEviction() { - CompiledSubgraph* entry_to_mark = entries_by_last_use_.begin()->second; - VLOG(1) << "Marking " << entry_to_mark->key << " for eviction"; - entries_by_last_use_.erase(entry_to_mark->last_use); - --cache_entries_; - ++marked_for_eviction_entries_; - // Discard the cache's reference to entry. If steps are holding onto - // references to entry it won't be deleted until the last step holding it - // completes. It stays in the cache in the meantime and can be resurrected - // by a call to CompileIfKeyAbsent if that occurs before the last reference - // expires. - DiscardEntryRefLocked(entry_to_mark); -} - -void XRTCompilationCache::LookupEntryMarkedForEviction( - CompiledSubgraph* entry) { - // The entry was previously marked for eviction (or is newly created) so - // unmark it. Add a reference (owned by the cache), update the cache size, and - // mark something old for eviction if necessary. - entry->Ref(); - --marked_for_eviction_entries_; - ++cache_entries_; - - // Mark the least-recently-used non-marked entry for eviction. Never mark the - // most-recently used entry (i.e., do nothing if entries_by_last_use_ == 1 - // which means there's only one entry not already marked for eviction), so - // that an entry persists in the cache even if it is larger than the allocated - // cache size. - while (entries_by_last_use_.size() > 1 && - cache_entries_ > max_cache_entries_) { - MarkOldestEntryForEviction(); - } -} - -XRTCompilationCache::CompiledSubgraph* XRTCompilationCache::InitializeEntry( - const string& key, - const std::function*)>& - initialize_program) { - CompiledSubgraph* entry = new CompiledSubgraph(); - entry->parent = this; - entry->key = key; - entry->uid = get_uid(); - // Add the entry to the cache. Once the computation has been compiled, - // UpdateEntryAfterCompilation will be called to potentially mark old entries - // that don't fit any more for eviction. - // - // At this point there is one reference to entry, which is owned by the caller - // who created the entry. A second reference, owned by the cache, will be - // added below since we leave the entry in the 'marked for eviction' state - // here. - auto cache_inserted = - cache_.insert(std::pair(key, entry)); - CHECK(cache_inserted.second); - - // Initialize the program outside the lock so that other cache operations - // can proceed during the (potentially lengthy) initialization. - Status s; - std::unique_ptr program; - { - mu_.Unlock(); - { s = initialize_program(&program); } - mu_.Lock(); - } - - // Add the entry to the uid index. - auto uid_inserted = entries_by_uid_.insert( - std::pair(entry->uid, entry)); - CHECK(uid_inserted.second); - - entry->initialized = true; - entry->initialization_status = s; - if (s.ok()) { - entry->program = std::move(program); - } - // Add the entry to marked_for_eviction_entries_ since it will be adjusted - // down again when the newly-created entry gets unmarked. 
- ++marked_for_eviction_entries_; - return entry; -} - -Status XRTCompilationCache::CompileIfKeyAbsent( - const string& key, int64_t* uid, - const std::function*)>& - compile_function) { - CompiledSubgraph* entry = nullptr; - - absl::MutexLock lock(&mu_); - auto iter = cache_.find(key); - - if (iter == cache_.end()) { - // The single ref on the newly-created entry is owned by the caller. - VLOG(1) << "Before adding new entry for key " << key << " cache is " - << cache_.size() << " entries (" - << cache_entries_ + marked_for_eviction_entries_ << "), " - << " marked for eviction " - << (cache_.size() - entries_by_last_use_.size()) << " entries (" - << marked_for_eviction_entries_ << ")."; - entry = InitializeEntry(key, compile_function); - } else { - VLOG(1) << "Before refreshing entry for key " << key << " cache is " - << cache_.size() << " entries (" - << cache_entries_ + marked_for_eviction_entries_ << "), " - << " marked for eviction " - << (cache_.size() - entries_by_last_use_.size()) << " entries (" - << marked_for_eviction_entries_ << ")."; - entry = iter->second; - // Make a new reference that is owned by the caller. - entry->Ref(); - // Block if necessary until the subgraph has been initialized. - mu_.Await(absl::Condition( - +[](CompiledSubgraph* e) { return e->initialized; }, entry)); - } - - // Let the caller know the uid of the entry. - *uid = entry->uid; - - // Remove the old LRU-table entry if it wasn't already marked for eviction. - auto erased = entries_by_last_use_.erase(entry->last_use); - // Update the LRU table indicating this entry is the most recently used. - entry->last_use = use_counter_++; - entries_by_last_use_[entry->last_use] = entry; - if (erased == 0) { - // The entry had been marked for eviction, or is newly created. - LookupEntryMarkedForEviction(entry); - } - - VLOG(1) << "After refreshing entry for key " << key << " cache is " - << cache_.size() << " entries (" - << cache_entries_ + marked_for_eviction_entries_ << "), " - << " marked for eviction " - << (cache_.size() - entries_by_last_use_.size()) << " entries (" - << marked_for_eviction_entries_ << ")."; - - return entry->initialization_status; -} - -Status XRTCompilationCache::Lookup( - int64_t uid, std::unique_ptr* entry) { - entry->reset(); - - absl::MutexLock lock(&mu_); - const auto iter = entries_by_uid_.find(uid); - if (iter == entries_by_uid_.end()) { - return errors::NotFound("No executable found for uid ", uid); - } - CompiledSubgraph* cache_entry = iter->second; - *entry = std::unique_ptr( - new EntryRefImpl(this, cache_entry)); - return OkStatus(); -} - -string XRTCompilationCache::DebugString() const { - return "XRTCompilationCache"; -} - -xla::StatusOr> GetOrCreateCompilationCache( - ResourceMgr* rm, int64_t max_number_of_entries) { - if (max_number_of_entries == 0) { - max_number_of_entries = GetCompilationCacheSizeFromEnv(); - } - XRTCompilationCache* cache; - TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), kXRTCompilationCacheResourceName, &cache, - [&](XRTCompilationCache** new_cache) { - *new_cache = new XRTCompilationCache(max_number_of_entries); - return OkStatus(); - })); - return RefPtr(cache); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.h b/tensorflow/compiler/xrt/xrt_compilation_cache.h deleted file mode 100644 index 7c89bcc5a1ecea..00000000000000 --- a/tensorflow/compiler/xrt/xrt_compilation_cache.h +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_COMPILATION_CACHE_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_COMPILATION_CACHE_H_ - -#include -#include -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/synchronization/mutex.h" -#include "xla/client/local_client.h" -#include "xla/statusor.h" -#include "tensorflow/compiler/xrt/xrt_refptr.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/lib/core/refcount.h" - -namespace tensorflow { - -extern const char* kXRTCompilationCacheResourceName; - -struct XRTCompilationCacheEntry { - explicit XRTCompilationCacheEntry(xla::LocalExecutable* executable) - : executable(executable) {} - - // Returns a non-owned pointer to an immutable executable. - xla::LocalExecutable* get_executable() const { return executable; } - - private: - xla::LocalExecutable* executable; -}; - -// Base class for a reference to a cached executable. A unique_ptr to a -// XRTCompilationCacheEntryRef is returned by the cache Lookup methods below, -// and ensures the underlying executable is not garbage-collected until the -// client discards the ptr. -class XRTCompilationCacheEntryRef { - public: - virtual ~XRTCompilationCacheEntryRef() = default; - - // Returns a XRTCompilationCacheEntry that should not be used beyond the - // lifetime of the XRTCompilationCacheEntryRef. - virtual XRTCompilationCacheEntry get() = 0; -}; - -// Cache for compiled XLA executables. -// TODO(b/112646171) rationalize this with the other compilation caches. -// -// Each key identifies a unique XLA computation, and the value is executable -// generated by compiling the computation. -// -// When a computation is considered for compilation, the client calls -// -// auto key = ; -// auto compile_function = ; -// int64 uid; -// CompileIfKeyAbsent(computation_key, &uid, compile_function); -// -// where computation_key is the key computed for the computation. On success, -// uid contains an identifier that can be used to look up the executable. If the -// compiled executable were not present in the cache, compile_function would be -// called to generate it. -// -// The caller is responsible for calling Release(uid) once for every -// call to CompileIfKeyAbsent(key, ...) to discard the reference to the -// compilation results, after the caller is sure it will not look up the -// compiled executables again. -// -// Subsequently the client can call -// -// std::unique_ptr entry; -// Lookup(uid, &entry); -// auto proto = entry->get(); -// -// to access a cached executable. -class XRTCompilationCache : public ResourceBase { - public: - // There is no way in general to discover the size taken by an XLA executable, - // so the cache defaults to a specific number of entries to determine when to - // start evicting programs. TODO(b/112592410) change this if the XLA API gets - // a mechanism to query size. 
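A minimal sketch of the call pattern documented in the class comment above, assuming the compile callback takes a std::unique_ptr<xla::LocalExecutable>* as the CompileIfKeyAbsent comment describes (the Compile helper and computation_key below are illustrative, not part of the API):

  int64_t uid;
  TF_RETURN_IF_ERROR(cache->CompileIfKeyAbsent(
      computation_key, &uid,
      [&](std::unique_ptr<xla::LocalExecutable>* program) {
        // Build the executable for this key and hand it to the cache.
        return Compile(computation, program);
      }));
  {
    std::unique_ptr<XRTCompilationCacheEntryRef> entry;
    TF_RETURN_IF_ERROR(cache->Lookup(uid, &entry));
    xla::LocalExecutable* executable = entry->get().get_executable();
    // ... run executable; it stays valid while entry is alive ...
  }
  // Each successful CompileIfKeyAbsent must eventually be balanced by Release.
  TF_RETURN_IF_ERROR(cache->Release(uid));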
- explicit XRTCompilationCache(int max_number_of_entries); - ~XRTCompilationCache() override; - - // Ensures there is an entry for key present in the cache. By the time - // CompileIfKeyAbsent returns there is guaranteed to be an entry in the cache - // for key, and that entry will remain valid at least until Release is called - // on the returned uid. The first call to CompileIfKeyAbsent with a key that - // is not in the cache will evaluate compile_function to compute the value to - // use in the entry. Subsequent calls with the same key will block until - // compile_function completes. Other cache reads and inserts may proceed on - // other threads while compile_function is executing. The caller is - // responsible for calling Release(uid) to manually discard its reference to - // the compiled program, once the caller will not look up the compiled program - // again. - // - // compile_function should compile the computation represented by key and fill - // the xla::LocalExecutable into its passed argument. It should return OK - // if and only if compilation succeeds. The executable will be discarded on - // non-OK status. - Status CompileIfKeyAbsent( - const string& key, int64_t* uid, - const std::function*)>& - compile_function); - - Status Release(int64_t uid); - - // Looks up an executable corresponding to uid. On success a pointer to an - // EntryRef holding the program is returned in entry. - Status Lookup(int64_t uid, - std::unique_ptr* entry); - - string DebugString() const override; - - private: - // An entry in the compilation cache. The entry is deleted once it has been - // marked for eviction from the cache _and_ all looked-up entries have been - // released. When the entry is first created, it is uninitialized and a - // client-supplied compilation function is run outside the cache's lock to - // generate the program to be stored in the entry. Any other client that - // requests the entry will block until it has been initialized. Each entry has - // a last_use value that set from a monotonically-increasing counter in the - // cache whenever the entry is referenced. When the cache becomes full, - // entries are marked for eviction in LRU order. - struct CompiledSubgraph : public core::RefCounted { - ~CompiledSubgraph() override = default; - - XRTCompilationCache* parent = nullptr; // Not owned. - bool initialized = false; - // The Status returned by the compilation function when the entry is - // initialized. This status will be returned to any client that requests the - // entry. - Status initialization_status; - // Counter to keep track of LRU entries for the eviction policy. - int64_t last_use = -1; - // The unique key describing this entry. - string key; - // The uid describing this entry. - int64_t uid; - // The compiled payload corresponding to the key. - std::unique_ptr program; - }; - - // Wrapper for a cache entry that holds a reference to the entry until the - // wrapper is deleted. This wrapper is the concrete type of - // XRTCompilationCacheEntryRef returned by Lookup. - class EntryRefImpl : public XRTCompilationCacheEntryRef { - public: - EntryRefImpl(XRTCompilationCache* parent, CompiledSubgraph* entry); - ~EntryRefImpl() override; - - XRTCompilationCacheEntry get() override; - - private: - XRTCompilationCache* parent_; // Not owned. - // A reference to entry_ is acquired in the contructor and released via - // parent->DiscardEntryRef in the destructor. - CompiledSubgraph* entry_; - }; - - // Releases one reference to entry. 
This is called by the cache when entry is - // marked for eviction; or by an EntryRefImpl when it is destroyed. Before the - // last reference to entry is released, entry is removed from cache_. - void DiscardEntryRef(CompiledSubgraph* entry); - void DiscardEntryRefLocked(CompiledSubgraph* entry) - TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); - - // Marks the oldest unmarked entry for eviction. Requires that there is at - // least one such entry. - void MarkOldestEntryForEviction() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); - - // Updates datastructures to indicate that entry, which had been marked for - // eviction, has been looked up. This is called by CompileIfKeyAbsent when an - // entry is newly created, or an entry that has been marked for eviction but - // not yet evicted is looked up. - // - // First the entry is unmarked for eviction, i.e. the cache gains a reference - // to entry, entry's last_use field is set to be the most recent value of - // use_counter_ and entries_by_last_use_ is updated accordingly. - // - // Next, the size of the cache is examined to see if any other entries need to - // be marked for eviction now that entry has been unmarked. While the total - // number of unmarked cached entries is greater than max_cache_entries_, - // entries are marked for eviction in LRU order. The most recently used entry - // is never marked for eviction, so an entry larger than the max cache entries - // will remain in the cache until it is replaced by something else. - void LookupEntryMarkedForEviction(CompiledSubgraph* entry) - TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); - - // Creates a new entry by running initialize_program and places it in the - // cache to be looked up by key. The new entry is in the 'marked for eviction' - // state (not present in entries_by_last_use_) and the caller is expected to - // call LookupEntryMarkedForEviction after InitializeEntry. - // - // **InitializeEntry releases mu_ during the call to initialize_program.** - CompiledSubgraph* InitializeEntry( - const string& key, - const std::function*)>& - initialize_program) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); - - // The maximum number of entries that are stored in the cache before entries - // are marked for eviction. - const int max_cache_entries_; - - mutable absl::Mutex mu_; - // The total number of entries that are stored and not marked for eviction. - int cache_entries_ TF_GUARDED_BY(mu_) = 0; - // The total number of entries that are marked for eviction. - int marked_for_eviction_entries_ TF_GUARDED_BY(mu_) = 0; - // The value to assign to the last_use field of the next entry that is looked - // up. - int64_t use_counter_ TF_GUARDED_BY(mu_) = 0; - // All the executables that can be looked up in the cache index by key. An - // entry is marked for eviction iff it is present in cache_ and not in - // entries_by_last_use_. - std::unordered_map cache_ TF_GUARDED_BY(mu_); - // All the executable entries that can be looked up in the cache indexed by - // uid. - absl::flat_hash_map entries_by_uid_ - TF_GUARDED_BY(mu_); - // Map from last_use to entry, used to mark entries for eviction in LRU - // order. If an entry's last_use counter is not present as a key in - // entries_by_last_use_ then the entry has been marked for eviction. - std::map entries_by_last_use_ TF_GUARDED_BY(mu_); -}; - -// Looks up or create an XRTCompilationCache object within the given resource -// manager, under the default container. The max_number_of_entries sets the -// maximum number of entries within the cache (which will be LRU-evicted). 
-// If max_number_of_entries is set to sero, the size of the cache will be -// configured using the TF_XRT_COMPILATION_CACHE_SIZE environment variable. -xla::StatusOr> GetOrCreateCompilationCache( - ResourceMgr* rm, int64_t max_number_of_entries); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_COMPILATION_CACHE_H_ diff --git a/tensorflow/compiler/xrt/xrt_device.cc b/tensorflow/compiler/xrt/xrt_device.cc deleted file mode 100644 index 9e1d929f429194..00000000000000 --- a/tensorflow/compiler/xrt/xrt_device.cc +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for managing access to XLA resources. - -#include "tensorflow/compiler/xrt/xrt_device.h" - -#include -#include -#include - -#include "absl/container/node_hash_map.h" -#include "tensorflow/compiler/jit/xla_device.h" -#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tsl/framework/device_id.h" - -namespace tensorflow { -namespace { - -class ResourceMgrArena { - public: - static ResourceMgrArena* Get() { - static ResourceMgrArena* arena = new ResourceMgrArena(); - return arena; - } - - ResourceMgr* GetResourceMgr(const std::string& platform_name) { - mutex_lock lock(mutex_); - auto it = resource_managers_.find(platform_name); - if (it == resource_managers_.end()) { - it = resource_managers_.emplace(platform_name, new ResourceMgr()).first; - } - return it->second; - } - - private: - mutex mutex_; - std::map resource_managers_; -}; - -} // namespace - -/*static*/ Status XRTGenericDeviceAccessor::GetResourceManager( - OpKernelContext* ctx, ResourceMgr** rm) { - const XlaDevice::Metadata* metadata; - TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); - *rm = ResourceMgrArena::Get()->GetResourceMgr(metadata->platform()->Name()); - return OkStatus(); -} - -/* static */ xla::StatusOr> -XRTGenericDeviceAccessor::GetOrCreateCompilationCache( - OpKernelContext* ctx, int64_t max_number_of_entries) { - ResourceMgr* rm; - TF_RETURN_IF_ERROR(GetResourceManager(ctx, &rm)); - return tensorflow::GetOrCreateCompilationCache(rm, max_number_of_entries); -} - -/*static*/ Status XRTGenericDeviceAccessor::InitScopedRef( - OpKernelContext* ctx, int device_ordinal, ScopedRef* scoped_ref) { - const XlaDevice::Metadata* metadata; - TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); - if (device_ordinal != metadata->device_ordinal()) { - return errors::Internal("XRT device ordinal requested ", device_ordinal, - " on device with ordinal ", - metadata->device_ordinal()); - } - scoped_ref->Acquire(metadata->client(), device_ordinal, - metadata->platform()->Name(), ctx); - return OkStatus(); -} - -/*static*/ Status XRTGenericDeviceAccessor::InitScopedRef( - OpKernelContext* ctx, ScopedRef* 
scoped_ref) { - const XlaDevice::Metadata* metadata; - TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); - scoped_ref->Acquire(metadata->client(), metadata->device_ordinal(), - metadata->platform()->Name(), ctx); - return OkStatus(); -} - -/* static */ tensorflow::mutex - XRTGenericDeviceAccessor::ScopedRef::cuda_allocator_mutex_( - tensorflow::LINKER_INITIALIZED); -/* static */ absl::flat_hash_map>* - XRTGenericDeviceAccessor::ScopedRef::cuda_allocators_ = - new absl::flat_hash_map>; - -void XRTGenericDeviceAccessor::ScopedRef::Acquire( - xla::LocalClient* client, int ordinal, const std::string& platform_name, - OpKernelContext* ctx) { - client_ = client; - ordinal_ = ordinal; - allocator_ = client_->mutable_backend()->memory_allocator(); -#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ - (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) - if (platform_name == "CUDA") { - // Use BfcAllocator for the CUDA. - auto stream = ctx->op_device_context()->stream(); - if (!cuda_allocators_->count(stream)) { - mutex_lock lock(cuda_allocator_mutex_); - if (!cuda_allocators_->count(stream)) { - GPUOptions gpu_options; - Allocator* raw_allocator = - GPUProcessState::singleton()->GetGPUAllocator( - tsl::TfDeviceId(ordinal_)); - (*cuda_allocators_)[stream] = - std::make_unique(raw_allocator, stream); - } - } - allocator_ = static_cast( - (*cuda_allocators_)[stream].get()); - } -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -} -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_device.h b/tensorflow/compiler/xrt/xrt_device.h deleted file mode 100644 index de9f2c589a8bcc..00000000000000 --- a/tensorflow/compiler/xrt/xrt_device.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for keeping track of on-device state. - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_DEVICE_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_DEVICE_H_ - -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "xla/client/local_client.h" -#include "xla/stream_executor/integrations/tf_allocator_adapter.h" -#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/platform/mutex.h" - -namespace tensorflow { - -// This accessor is used for XLA CPU/GPU. It uses the device resource manager, -// so e.g., on multi-GPU setups the compilation cache will not be shared across -// devices. -class XRTGenericDeviceAccessor { - public: - static Status GetResourceManager(OpKernelContext* ctx, ResourceMgr** rm); - - static xla::StatusOr> GetOrCreateCompilationCache( - OpKernelContext* ctx, int64_t max_number_of_entries); - - // We use a ScopedRef pattern here even though it's not strictly necessary, - // just so that templated uses of this and the TPU accessor class will be as - // similar as possible. 
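A minimal sketch of how an XRT op kernel is expected to use this accessor, based on the ScopedRef interface declared below (ctx is the kernel's OpKernelContext*):

  XRTGenericDeviceAccessor::ScopedRef device_ref;
  TF_RETURN_IF_ERROR(
      XRTGenericDeviceAccessor::InitScopedRef(ctx, &device_ref));
  xla::LocalClient* client = device_ref.client();
  se::DeviceMemoryAllocator* allocator = device_ref.allocator();
  int ordinal = device_ref.device_ordinal();
  // client and allocator remain usable for the lifetime of device_ref.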
- class ScopedRef { - public: - ScopedRef() = default; - ~ScopedRef() = default; - - ScopedRef(const ScopedRef&) = delete; - ScopedRef& operator=(const ScopedRef&) = delete; - - // Returns the XLA device protected by this ScopedRef. - xla::LocalClient* client() const { return client_; } - xla::Backend* backend() { return client_->mutable_backend(); } - int device_ordinal() const { return ordinal_; } - se::DeviceMemoryAllocator* allocator() { return allocator_; } - - private: - // XRTGenericDeviceAccessor::InitScopedRef is the only way to initialize - // ScopedRef. - friend class XRTGenericDeviceAccessor; - - void Acquire(xla::LocalClient* client, int ordinal, - const std::string& platform_name, OpKernelContext* ctx); - - xla::LocalClient* client_ = nullptr; - int ordinal_ = 0; - se::DeviceMemoryAllocator* allocator_ = nullptr; - static tensorflow::mutex cuda_allocator_mutex_; - static absl::flat_hash_map>* - cuda_allocators_; - }; - - static Status InitScopedRef(OpKernelContext* ctx, int device_ordinal, - ScopedRef* scoped_ref); - - static Status InitScopedRef(OpKernelContext* ctx, ScopedRef* scoped_ref); -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_DEVICE_H_ diff --git a/tensorflow/compiler/xrt/xrt_memory_manager.cc b/tensorflow/compiler/xrt/xrt_memory_manager.cc deleted file mode 100644 index 05325a822d9d22..00000000000000 --- a/tensorflow/compiler/xrt/xrt_memory_manager.cc +++ /dev/null @@ -1,370 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xrt/xrt_memory_manager.h" - -#include -#include -#include -#include -#include -#include - -#include "absl/memory/memory.h" -#include "tensorflow/compiler/xrt/xrt_metrics.h" -#include "tensorflow/core/lib/monitoring/timed.h" -#include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/profiler/lib/traceme.h" - -namespace tensorflow { -namespace { - -// We use kDeviceBits to store the device ordinal in the handle. We store the -// device in the upper part of the int64 handle to make sure the random bits are -// in the lower part which is better when storing the handle as a key for -// unordered maps. 
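Worked example of the handle layout described above, using the MakeDeviceHandle and GetDeviceFromHandle helpers defined next (kDeviceBits == 12, so the random uid occupies the low 52 bits):

  int64_t h = MakeDeviceHandle(/*device_ordinal=*/3, /*rnd_value=*/0x1234);
  // h == (3LL << 52) | 0x1234
  int device = GetDeviceFromHandle(h);  // == 3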
-const int kDeviceBits = 12; - -int64_t MakeDeviceHandle(int64_t device_ordinal, int64_t rnd_value) { - const int64_t kUidMask = (static_cast(1) << (64 - kDeviceBits)) - 1; - return (device_ordinal << (64 - kDeviceBits)) | (rnd_value & kUidMask); -} - -int GetDeviceFromHandle(int64_t handle) { - return (handle >> (64 - kDeviceBits)) & ((1 << kDeviceBits) - 1); -} - -} // namespace - -class XRTMemoryManager::DeviceContext { - struct Alloc { - explicit Alloc(RefPtr tuple) - : tuple(std::move(tuple)) {} - - RefPtr tuple; - }; - - using AllocList = std::list; - - public: - int64_t Register(RefPtr tuple) { - while (true) { - int64_t handle = MakeDeviceHandle(tuple->device_ordinal(), CreateUid()); - mutex_lock lock(lock_); - allocs_.emplace_front(tuple); - if (alloc_map_.emplace(handle, allocs_.begin()).second) { - return handle; - } - // The chances of hitting an existing handle are so remote, it is much - // more convenient to add to the list before, and eventually removing. - allocs_.erase(allocs_.begin()); - } - } - - bool Release(int64_t handle) { - mutex_lock lock(lock_); - auto it = alloc_map_.find(handle); - if (it == alloc_map_.end()) { - return false; - } - allocs_.erase(it->second); - alloc_map_.erase(it); - return true; - } - - RefPtr Lookup(int64_t handle) { - mutex_lock lock(lock_); - auto it = alloc_map_.find(handle); - if (it == alloc_map_.end()) { - return nullptr; - } - // LRU - allocs_.splice(allocs_.begin(), allocs_, it->second); - return it->second->tuple; - } - - void Clear() { - mutex_lock lock(lock_); - alloc_map_.clear(); - allocs_.clear(); - } - - Status CompactAllocations(XRTMemoryManager* memory_manager, - xla::Backend* backend, - se::DeviceMemoryAllocator* allocator) { - profiler::TraceMe trace_me("XRTMemoryManager::CompactAllocations", - /*level=*/2); - auto timed = monitoring::MakeTimed(xrt_metrics::GetMemoryCompactCell()); - VLOG(4) << "CompactAllocations started"; - mutex_lock lock(lock_); - Status status; - std::vector swapped; - // We are swapping out from the most recently used allocations. This is - // desirable since the most recently used will be finding themselves at the - // bottom of the allocation space. Since these are more likely to be pinned - // allocations, a further trim done by following TryFreeMemory() call will - // eventually drop the higher located allocations, with better chance of - // reducing fragmentation. - // Also, by swapping out the pinned allocations first, those will also be - // the first to be restored, and hence if we will ever find OOM on the way - // out, we would more likely be swapping in not pinned ones. - for (auto it = allocs_.begin(); it != allocs_.end(); ++it) { - // We are compacting all the allocations, so we will temporarily swap out - // even pinned allocations. - auto swap_result_or = it->tuple->SwapOut(backend, /*swap_pinned=*/true); - if (!swap_result_or.ok()) { - status = swap_result_or.status(); - break; - } - if (swap_result_or.value()) { - swapped.push_back(it); - } - } - // At this point we have released all the device memory we could release. - // Load back the tuple allocations we have swapped out above. - for (auto& it : swapped) { - auto swap_result_or = - it->tuple->SwapIn(memory_manager, backend, allocator); - if (!swap_result_or.ok()) { - // If we failed to restored a pinned allocation, better to CHECK here - // than wondering why XRTTupleAllocation calls fail with errors about - // missing buffers. 
- CHECK(!it->tuple->IsPinned()); // Crash OK - if (status.ok()) { - status = swap_result_or.status(); - } - } - } - VLOG(4) << "CompactAllocations finished: " << status; - return status; - } - - // Tries to free size bytes by freeing some unpinned device memory. Returns - // the amount of memory which was able to free. - xla::StatusOr TryFreeMemory(xla::Backend* backend, size_t size) { - profiler::TraceMe trace_me("XRTMemoryManager::TryFreeMemory", /*level=*/2); - auto timed = monitoring::MakeTimed(xrt_metrics::GetTryFreeMemoryCell()); - mutex_lock lock(lock_); - size_t swapped_size = 0; - for (auto it = allocs_.rbegin(); it != allocs_.rend(); ++it) { - TF_ASSIGN_OR_RETURN(bool swap_result, - it->tuple->SwapOut(backend, /*swap_pinned=*/false)); - if (swap_result) { - swapped_size += it->tuple->GetDeviceMemorySize(); - if (swapped_size >= size) { - break; - } - } - } - VLOG(3) << "Swapped out " << swapped_size << " bytes"; - return swapped_size; - } - - private: - static int64_t CreateUid() { - int64_t uid; - do { - uid = random::New64() & INT64_MAX; - } while (uid == InvalidKey()); - return uid; - } - - // We store Alloc records inside an std::list so we can LRU it, and - // store the list iterators within the handle map, as list iterators don't get - // invalidated by (other elements) removals or position swaps. - mutex lock_; - AllocList allocs_; - std::unordered_map alloc_map_; -}; - -XRTMemoryManager::WorkingSet::WorkingSet( - RefPtr memory_manager) - : memory_manager_(std::move(memory_manager)) {} - -XRTMemoryManager::WorkingSet::~WorkingSet() { - for (auto& tuple : pinned_tuples_) { - tuple->Unpin(); - } -} - -Status XRTMemoryManager::WorkingSet::LookupAndPin( - xla::Backend* backend, int64_t handle, - se::DeviceMemoryAllocator* allocator) { - TF_ASSIGN_OR_RETURN(auto tuple, memory_manager_->Lookup(handle)); - TF_RETURN_IF_ERROR( - tuple->PinAndSwapIn(memory_manager_.get(), backend, allocator).status()); - pinned_tuples_.push_back(std::move(tuple)); - return OkStatus(); -} - -/* static */ RefPtr XRTMemoryManager::Get(ResourceMgr* rm) { - static string* container = new string("XrtState"); - static string* name = new string("MemoryManager"); - XRTMemoryManager* memory_manager = nullptr; - TF_CHECK_OK(rm->LookupOrCreate( - *container, *name, &memory_manager, [](XRTMemoryManager** ret) { - *ret = new XRTMemoryManager(); - return OkStatus(); - })); - return memory_manager; -} - -int64_t XRTMemoryManager::Register(RefPtr tuple) { - DeviceContext* device_context = GetDeviceContext(tuple->device_ordinal(), - /*create_if_missing=*/true); - return device_context->Register(std::move(tuple)); -} - -xla::StatusOr> XRTMemoryManager::Lookup( - int64_t handle) { - int device_ordinal = GetDeviceFromHandle(handle); - DeviceContext* device_context = GetDeviceContext(device_ordinal, - /*create_if_missing=*/false); - if (device_context == nullptr) { - return errors::NotFound("XRT memory handle not found: ", handle); - } - RefPtr tuple = device_context->Lookup(handle); - if (tuple == nullptr) { - return errors::NotFound("XRT memory handle not found: ", handle); - } - return std::move(tuple); -} - -Status XRTMemoryManager::Release(int64_t handle) { - int device_ordinal = GetDeviceFromHandle(handle); - DeviceContext* device_context = GetDeviceContext(device_ordinal, - /*create_if_missing=*/false); - if (device_context == nullptr || !device_context->Release(handle)) { - return errors::NotFound("XRT memory handle not found: ", handle); - } - return OkStatus(); -} - -Status 
XRTMemoryManager::CompactAllocations( - xla::Backend* backend, int device_ordinal, - se::DeviceMemoryAllocator* allocator) { - DeviceContext* device_context = GetDeviceContext(device_ordinal, - /*create_if_missing=*/false); - return device_context != nullptr - ? device_context->CompactAllocations(this, backend, allocator) - : OkStatus(); -} - -void XRTMemoryManager::ReleaseAllAllocations() { - mutex_lock lock(lock_); - for (auto& device_context : device_contexts_) { - if (device_context != nullptr) { - device_context->Clear(); - } - } -} - -xla::StatusOr XRTMemoryManager::Allocate( - xla::Backend* backend, int device_ordinal, size_t size, - se::DeviceMemoryAllocator* allocator) { - auto memory_or = - allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false); - if (memory_or.status().code() == error::RESOURCE_EXHAUSTED) { - VLOG(4) << "Allocate of " << size << " bytes failed on device " - << device_ordinal; - - DeviceContext* device_context = - GetDeviceContext(device_ordinal, - /*create_if_missing=*/false); - if (device_context != nullptr) { - Status status = device_context->TryFreeMemory(backend, size).status(); - if (status.ok()) { - // As long as there is no error, we still try again the allocation, even - // if the TryFreeMemory() call ended up freeing less memory than the - // required size. Fragmentation could make the memory allocation succeed - // even if the freed memory is indeed lower. - memory_or = allocator->Allocate(device_ordinal, size, - /*retry_on_failure=*/false); - } else if (status.code() != error::RESOURCE_EXHAUSTED) { - VLOG(4) << "Allocate of " << size << " bytes on device " - << device_ordinal << ": " << status; - return status; - } - } - } - return memory_or; -} - -string XRTMemoryManager::DebugString() const { - // We might want to emit more detailed information here, like per device - // memory allocations. - return "XRTMemoryManager"; -} - -XRTMemoryManager::DeviceContext* XRTMemoryManager::GetDeviceContext( - int device_ordinal, bool create_if_missing) { - mutex_lock lock(lock_); - if (device_ordinal >= device_contexts_.size()) { - if (!create_if_missing) { - return nullptr; - } - device_contexts_.resize(device_ordinal + 1); - } - DeviceContext* device_context = device_contexts_[device_ordinal].get(); - if (device_context == nullptr && create_if_missing) { - device_contexts_[device_ordinal] = std::make_unique(); - device_context = device_contexts_[device_ordinal].get(); - } - return device_context; -} - -Status XRTMemoryManager::TryFreeMemoryStep(MemoryReclaimContext* mrctx, - const Status& status) { - DeviceContext* device_context = GetDeviceContext(mrctx->device_ordinal, - /*create_if_missing=*/false); - if (device_context == nullptr) { - return status; - } - if (!mrctx->done_freeing) { - // If the caller passed us a zero requested_free_size, we try to free chunks - // of kMaxFreeSize memory, until either the run function succeeds, or we run - // out of freeable memory. - const size_t kMaxFreeSize = 1000000000; - size_t free_size = - (mrctx->requested_free_size > 0) - ? 
std::min(mrctx->requested_free_size - mrctx->free_size, - kMaxFreeSize) - : kMaxFreeSize; - if (free_size > 0) { - auto free_size_or = - device_context->TryFreeMemory(mrctx->backend, free_size); - if (!free_size_or.ok()) { - return status; - } - size_t size = free_size_or.value(); - mrctx->free_size += size; - if (size > 0) { - return OkStatus(); - } - } - mrctx->done_freeing = true; - } - if (!mrctx->done_compacting) { - mrctx->done_compacting = true; - if (device_context - ->CompactAllocations(this, mrctx->backend, mrctx->allocator) - .ok()) { - return OkStatus(); - } - } - return status; -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_memory_manager.h b/tensorflow/compiler/xrt/xrt_memory_manager.h deleted file mode 100644 index 519938c525a18f..00000000000000 --- a/tensorflow/compiler/xrt/xrt_memory_manager.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_MEMORY_MANAGER_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_MEMORY_MANAGER_H_ - -#include -#include - -#include "xla/service/backend.h" -#include "xla/statusor.h" -#include "xla/stream_executor/device_memory_allocator.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/xrt_refptr.h" -#include "tensorflow/compiler/xrt/xrt_state.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { - -// The XRTMemoryManager manages all the XRT allocations. It is a ResourceBase -// object which leaves within the ResourceMgr. This is only one XRT memory -// manager object within the ResourceMgr container. -class XRTMemoryManager : public ResourceBase { - // The DeviceContext class, defined and implemented locally inside the - // xrt_memory_manager.cc file, holds, for each device, all the information - // related to the XRT memory management for such device. - class DeviceContext; - - public: - // A working set is a set of tuple allocations which are the input of a given - // operation, and as such they must be pinned on the device memory. The tuple - // allocations added to the WorkingSet will be unpinned at object destruction. - class WorkingSet { - public: - explicit WorkingSet(RefPtr memory_manager); - - ~WorkingSet(); - - // Looks up the tuple handle within the memory manager, and pins it to the - // device (if not already pinned). 
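Editor's note: the WorkingSet described above is an RAII helper: everything looked up through it gets pinned, and the destructor unpins in bulk so callers cannot forget. Below is a minimal standalone sketch of that pin-on-add / unpin-on-destruction pattern; the Pinnable type and all names are illustrative stand-ins, not the deleted XRT classes.

#include <atomic>
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

// Stand-in for an object that must stay pinned while an operation uses it.
struct Pinnable {
  std::atomic<int64_t> pin_count{0};
  void Pin() { pin_count.fetch_add(1); }
  void Unpin() { pin_count.fetch_sub(1); }
  bool IsPinned() const { return pin_count.load() != 0; }
};

// Pins every object added to it, and unpins them all when it goes out of
// scope, mirroring the WorkingSet destructor behaviour described above.
class WorkingSetSketch {
 public:
  WorkingSetSketch() = default;
  ~WorkingSetSketch() {
    for (auto& obj : pinned_) obj->Unpin();
  }
  WorkingSetSketch(const WorkingSetSketch&) = delete;
  WorkingSetSketch& operator=(const WorkingSetSketch&) = delete;

  void Add(std::shared_ptr<Pinnable> obj) {
    obj->Pin();
    pinned_.push_back(std::move(obj));
  }

 private:
  std::vector<std::shared_ptr<Pinnable>> pinned_;
};

int main() {
  auto alloc = std::make_shared<Pinnable>();
  {
    WorkingSetSketch ws;
    ws.Add(alloc);
    assert(alloc->IsPinned());   // Pinned while the operation runs.
  }
  assert(!alloc->IsPinned());    // Unpinned once the working set is destroyed.
}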
- Status LookupAndPin(xla::Backend* backend, int64_t handle, - se::DeviceMemoryAllocator* allocator); - - const std::vector>& PinnedTuples() const { - return pinned_tuples_; - } - - const RefPtr& MemoryManager() const { - return memory_manager_; - } - - private: - RefPtr memory_manager_; - std::vector> pinned_tuples_; - }; - - // Retrieves the XRTMemoryManager singleton stored within the ResourceMgr. - static RefPtr Get(ResourceMgr* rm); - - // Registers an XRTTupleAllocation and returns the unique handle identifying - // it. - int64_t Register(RefPtr tuple); - - // Looks up an handle returned by the Register() API and returns the - // XRTTupleAllocation behind it. - xla::StatusOr> Lookup(int64_t handle); - - Status Lookup(int64_t handle, RefPtr* tuple) { - TF_ASSIGN_OR_RETURN(*tuple, Lookup(handle)); - return OkStatus(); - } - - // Releases an handle by dropping the references count held on the - // XRTTupleAllocation by the XRTMemoryManager. Existing XRTTupleAllocation - // references will continue to be valid. - Status Release(int64_t handle); - - // Tries to compact all the memory allocations on a given device. This is - // currently done by swapping-out all the existing allocation, and swapping - // them back in. - Status CompactAllocations(xla::Backend* backend, int device_ordinal, - se::DeviceMemoryAllocator* allocator); - - // Releases all the device memory allocated by XRT within the resource - // manager. - void ReleaseAllAllocations(); - - // Tries to allocate size bytes of device memory from the device_ordinal - // device. Might attempt to free some unpinned device memory, if the underline - // allocator call fails, and try the allocation again. - xla::StatusOr Allocate( - xla::Backend* backend, int device_ordinal, size_t size, - se::DeviceMemoryAllocator* allocator); - - // Runs the specified function and handling the error::RESOURCE_EXHAUSTED - // status code coming out of it. In such cases, we run different memory - // freeing operations trying to make runfn succeed. The requested_free_size - // argument represents an hint of the requested memory size which would make - // runfn succeed. - template - xla::StatusOr Run(const std::function()>& runfn, - xla::Backend* backend, int device_ordinal, - size_t requested_free_size, - se::DeviceMemoryAllocator* allocator); - - string DebugString() const override; - - // Returns the invalid key value, which will be never generated by the - // Intern() API. - static int64_t InvalidKey() { return 0; } - - private: - // Structure used to track the progress of a try-to-free operation. It is - // initialized and the passed to the TryFreeMemoryStep() API. - struct MemoryReclaimContext { - MemoryReclaimContext(xla::Backend* backend, int device_ordinal, - size_t requested_free_size, - se::DeviceMemoryAllocator* specific_allocator) - : backend(backend), - device_ordinal(device_ordinal), - requested_free_size(requested_free_size) { - allocator = specific_allocator; - } - - xla::Backend* const backend = nullptr; - se::DeviceMemoryAllocator* allocator = nullptr; - const int device_ordinal = 0; - const size_t requested_free_size = 0; - size_t free_size = 0; - bool done_freeing = false; - bool done_compacting = false; - }; - - DeviceContext* GetDeviceContext(int device_ordinal, bool create_if_missing); - - // Called multiple times while trying to make a memory consuming function call - // to fit. 
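Editor's note: the Run()/TryFreeMemoryStep() pair described here retries a memory-hungry callback, interleaving progressively more expensive reclamation steps until the callback stops failing with a resource-exhausted error or nothing more can be freed. Below is a standalone sketch of that control flow with a toy result type in place of the TensorFlow Status machinery; all names are illustrative.

#include <cstddef>
#include <functional>
#include <iostream>
#include <optional>
#include <vector>

// Toy result type: either a value or an "out of memory" failure.
struct Result {
  std::optional<int> value;
  bool resource_exhausted = false;
};

// Reclamation steps ordered from cheapest to most expensive. Each returns
// true if it freed something, i.e. a retry is worthwhile.
using ReclaimStep = std::function<bool()>;

std::optional<int> RunWithReclaim(const std::function<Result()>& runfn,
                                  const std::vector<ReclaimStep>& steps) {
  std::size_t next_step = 0;
  while (true) {
    Result r = runfn();
    if (!r.resource_exhausted) return r.value;  // Success (or a non-OOM failure).
    // Out of memory: try the next, more expensive, reclamation step.
    bool freed = false;
    while (next_step < steps.size() && !freed) {
      freed = steps[next_step++]();
    }
    if (!freed) return std::nullopt;  // Nothing left to free; give up.
  }
}

int main() {
  int attempts = 0;
  auto runfn = [&]() -> Result {
    ++attempts;
    if (attempts < 3) return Result{std::nullopt, true};  // Fail twice with OOM.
    return Result{42, false};                             // Then succeed.
  };
  std::vector<ReclaimStep> steps = {
      [] { std::cout << "swap out unpinned buffers\n"; return true; },
      [] { std::cout << "compact allocations\n"; return true; },
  };
  auto value = RunWithReclaim(runfn, steps);
  std::cout << "result: " << (value ? *value : -1) << "\n";
}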
Performs progressively more expensive memory reduction operations, - // until returning error::RESOURCE_EXHAUSTED when no further reductions are - // possible. - Status TryFreeMemoryStep(MemoryReclaimContext* mrctx, const Status& status); - - mutex lock_; - std::vector> device_contexts_; -}; - -template -xla::StatusOr XRTMemoryManager::Run( - const std::function()>& runfn, xla::Backend* backend, - int device_ordinal, size_t requested_free_size, - se::DeviceMemoryAllocator* allocator) { - MemoryReclaimContext mrctx(backend, device_ordinal, requested_free_size, - allocator); - while (true) { - // We assume that runfn is a relatively fast-fail function compared to the - // operations required to free up the required memory. Here we call into the - // TryFreeMemoryStep() API multiple times, which will run progressively more - // expensive operations. - auto result_or = runfn(); - if (result_or.status().code() != error::RESOURCE_EXHAUSTED) { - return result_or; - } - TF_RETURN_IF_ERROR(TryFreeMemoryStep(&mrctx, result_or.status())); - } -} - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_MEMORY_MANAGER_H_ diff --git a/tensorflow/compiler/xrt/xrt_metrics.cc b/tensorflow/compiler/xrt/xrt_metrics.cc deleted file mode 100644 index e6e4ca8c5fef69..00000000000000 --- a/tensorflow/compiler/xrt/xrt_metrics.cc +++ /dev/null @@ -1,292 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xrt/xrt_metrics.h" - -#include -#include - -#include "tensorflow/core/lib/monitoring/collection_registry.h" -#include "tensorflow/core/platform/regexp.h" - -namespace tensorflow { -namespace { - -static const size_t kMaxSamples = 1024; - -std::vector GetDefaultPercentiles() { - return {25.0, 50.0, 80.0, 90.0, 95.0, 99.0}; -} - -bool IsSelectedMetric(const xrt::XRTMetricsCollect& metrics, - const string& name) { - if (metrics.metrics_regex_size() == 0) { - return true; - } - for (auto& metric_regex : metrics.metrics_regex()) { - if (RE2::FullMatch(name, metric_regex)) { - return true; - } - } - return false; -} - -void SetUnitOfMeasure(xrt::MetricValues* metrics, - monitoring::UnitOfMeasure unit_of_measure) { - switch (unit_of_measure) { - case monitoring::UnitOfMeasure::kNumber: - metrics->set_unit_of_measure(xrt::MetricValues::NUMBER); - break; - case monitoring::UnitOfMeasure::kTime: - metrics->set_unit_of_measure(xrt::MetricValues::TIME); - break; - case monitoring::UnitOfMeasure::kBytes: - metrics->set_unit_of_measure(xrt::MetricValues::BYTES); - break; - } -} - -Status AddMetrics(xrt::MetricsReport* report, - const monitoring::PointSet& point_set) { - for (auto& point : point_set.points) { - xrt::MetricValues* metrics = report->add_metrics(); - metrics->set_name(point_set.metric_name); - if (point->value_type == monitoring::ValueType::kPercentiles) { - xrt::Percentiles* percentiles = metrics->mutable_percentiles_value(); - SetUnitOfMeasure(metrics, point->percentiles_value.unit_of_measure); - percentiles->set_start_nstime(point->percentiles_value.start_nstime); - percentiles->set_end_nstime(point->percentiles_value.end_nstime); - percentiles->set_min_value(point->percentiles_value.min_value); - percentiles->set_max_value(point->percentiles_value.max_value); - percentiles->set_mean(point->percentiles_value.mean); - percentiles->set_stddev(point->percentiles_value.stddev); - percentiles->set_num_samples(point->percentiles_value.num_samples); - percentiles->set_total_samples(point->percentiles_value.total_samples); - percentiles->set_accumulator(point->percentiles_value.accumulator); - for (auto& pct_point : point->percentiles_value.points) { - xrt::Percentiles::Point* xpoint = percentiles->add_points(); - xpoint->set_percentile(pct_point.percentile); - xpoint->set_value(pct_point.value); - } - } else if (point->value_type == monitoring::ValueType::kInt64) { - metrics->set_unit_of_measure(xrt::MetricValues::NUMBER); - metrics->set_int64_value(point->int64_value); - } - } - return OkStatus(); -} - -} // namespace - -namespace xrt_metrics { - -monitoring::PercentileSamplerCell* GetAllocateCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/allocate", "Tracks XRTAllocate times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetAllocateUninitializedCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/allocate_uninitialized", - "Tracks XRTAllocateUninitialized times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetAllocateFromTensorCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - 
{"/tensorflow/xrt/ops/allocate_from_tensor", - "Tracks XRTAllocateFromTensor times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetSubTupleCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/sub_tuple", "Tracks XRTSubTuple times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetMakeTupleCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/make_tuple", "Tracks XRTMakeTuple times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetReadLiteralCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/read_literal", "Tracks XRTReadLiteral times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetReadToTensorCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/read_tensor", "Tracks XRTReadToTensor times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetWriteLiteralCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/write_literal", "Tracks XRTWriteLiteral times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetReleaseAllocationCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/release_allocation", - "Tracks XRTReleaseAllocation times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetReleaseAllAllocationsCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/release_all_allocations", - "Tracks XRTReleaseAllAllocations times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetCompactAllocationsCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/compact_allocations", - "Tracks XRTCompactAllocations times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetCompileCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/compile", "Tracks XRTCompile times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetReleaseCompilationCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/release_compilation", - "Tracks XRTReleaseCompilationRef times"}, - GetDefaultPercentiles(), kMaxSamples, - 
monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetExecuteCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/execute", "Tracks XRTExecute times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetExecuteChainedCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/ops/execute_chained", - "Tracks XRTExecuteChained times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetMemoryCompactCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/memory_manager/compaction", - "Tracks XRT memory manager memory compaction times"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -monitoring::PercentileSamplerCell* GetTryFreeMemoryCell() { - static monitoring::PercentileSamplerCell* cell = - monitoring::PercentileSampler<0>::New( - {"/tensorflow/xrt/memory_manager/try_free_memory", - "Tracks XRT memory manager times in trying to " - "free memory by swpping device memory to host memory"}, - GetDefaultPercentiles(), kMaxSamples, - monitoring::UnitOfMeasure::kTime) - ->GetCell(); - return cell; -} - -} // namespace xrt_metrics - -xla::StatusOr CollectMetrics( - const xrt::XRTMetricsCollect& metrics) { - auto* collection_registry = monitoring::CollectionRegistry::Default(); - monitoring::CollectionRegistry::CollectMetricsOptions options; - options.collect_metric_descriptors = false; - auto collected_metrics = collection_registry->CollectMetrics(options); - xrt::MetricsReport report; - for (auto& name_pointset : collected_metrics->point_set_map) { - if (IsSelectedMetric(metrics, name_pointset.first)) { - TF_RETURN_IF_ERROR(AddMetrics(&report, *name_pointset.second)); - } - } - return std::move(report); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_metrics.h b/tensorflow/compiler/xrt/xrt_metrics.h deleted file mode 100644 index d6afdbd7e33ab9..00000000000000 --- a/tensorflow/compiler/xrt/xrt_metrics.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_ - -#include "xla/statusor.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/core/lib/monitoring/percentile_sampler.h" - -namespace tensorflow { -namespace xrt_metrics { - -// Defines the singletons of the metrics populated by the XRT op framework. 
-// Single of a single XRT op there can be many device specific versions (CPU, -// GPU, TPU), and since the monitoring subsystem does not allow multiple -// registrations of the same metric name, we define them all in this file. -monitoring::PercentileSamplerCell* GetAllocateCell(); -monitoring::PercentileSamplerCell* GetAllocateUninitializedCell(); -monitoring::PercentileSamplerCell* GetAllocateFromTensorCell(); -monitoring::PercentileSamplerCell* GetSubTupleCell(); -monitoring::PercentileSamplerCell* GetMakeTupleCell(); -monitoring::PercentileSamplerCell* GetReadLiteralCell(); -monitoring::PercentileSamplerCell* GetReadToTensorCell(); -monitoring::PercentileSamplerCell* GetWriteLiteralCell(); -monitoring::PercentileSamplerCell* GetReleaseAllocationCell(); -monitoring::PercentileSamplerCell* GetReleaseAllAllocationsCell(); -monitoring::PercentileSamplerCell* GetCompactAllocationsCell(); -monitoring::PercentileSamplerCell* GetCompileCell(); -monitoring::PercentileSamplerCell* GetReleaseCompilationCell(); -monitoring::PercentileSamplerCell* GetExecuteCell(); -monitoring::PercentileSamplerCell* GetExecuteChainedCell(); -monitoring::PercentileSamplerCell* GetMemoryCompactCell(); -monitoring::PercentileSamplerCell* GetTryFreeMemoryCell(); - -} // namespace xrt_metrics - -xla::StatusOr CollectMetrics( - const xrt::XRTMetricsCollect& metrics); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_METRICS_H_ diff --git a/tensorflow/compiler/xrt/xrt_refptr.h b/tensorflow/compiler/xrt/xrt_refptr.h deleted file mode 100644 index 2db20dd71ce5ed..00000000000000 --- a/tensorflow/compiler/xrt/xrt_refptr.h +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Utility functions in support of the XRT API. - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_REFPTR_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_REFPTR_H_ - -#include - -namespace tensorflow { - -// Reference counted smart pointer for XRT objects providing the standard -// Ref()/Unref() APIs. -template -class RefPtr { - public: - RefPtr() = default; - // Creates a RefPtr from a pointer. This is an ownership transfer operation, - // and the caller has to own a valid reference to ptr (unless ptr is nullptr). 
- RefPtr(T* ptr) : ptr_(ptr) {} // NOLINT - RefPtr(const RefPtr& other) : ptr_(other.ptr_) { Acquire(ptr_); } - RefPtr(RefPtr&& other) : ptr_(other.ptr_) { other.ptr_ = nullptr; } - - ~RefPtr() { Release(ptr_); } - - RefPtr& operator=(const RefPtr& other) { - if (this != &other) { - Acquire(other.ptr_); - Release(ptr_); - ptr_ = other.ptr_; - } - return *this; - } - - RefPtr& operator=(RefPtr&& other) { - if (this != &other) { - Release(ptr_); - ptr_ = other.ptr_; - other.ptr_ = nullptr; - } - return *this; - } - - operator bool() const { return ptr_ != nullptr; } // NOLINT - bool operator==(const RefPtr& rhs) const { return ptr_ == rhs.ptr_; } - bool operator!=(const RefPtr& rhs) const { return ptr_ != rhs.ptr_; } - bool operator==(const T* ptr) const { return ptr_ == ptr; } - bool operator!=(const T* ptr) const { return ptr_ != ptr; } - bool operator==(std::nullptr_t ptr) const { return ptr_ == ptr; } - bool operator!=(std::nullptr_t ptr) const { return ptr_ != ptr; } - - T* get() const { return ptr_; } - - T* operator->() const { - CHECK(ptr_ != nullptr); // Crash OK - return ptr_; - } - - T& operator*() const { - CHECK(ptr_ != nullptr); // Crash OK - return *ptr_; - } - - T* release() { - T* ptr = ptr_; - ptr_ = nullptr; - return ptr; - } - - // Resets the RefPtr from a pointer. This is an ownership transfer operation, - // and the caller has to own a valid reference to ptr (unless ptr is nullptr). - void reset(T* ptr = nullptr) { - Release(ptr_); - ptr_ = ptr; - } - - private: - static void Release(T* ptr) { - if (ptr != nullptr) { - ptr->Unref(); - } - } - - static void Acquire(T* ptr) { - if (ptr != nullptr) { - ptr->Ref(); - } - } - - T* ptr_ = nullptr; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_REFPTR_H_ diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc deleted file mode 100644 index 4189c5e7bc1063..00000000000000 --- a/tensorflow/compiler/xrt/xrt_state.cc +++ /dev/null @@ -1,679 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for allocating XLA literals in device memory and managing handles -// that refer to them. - -#include "tensorflow/compiler/xrt/xrt_state.h" - -#include -#include -#include -#include -#include - -#include "absl/memory/memory.h" -#include "xla/service/backend.h" -#include "xla/status_macros.h" -#include "tensorflow/compiler/xrt/xrt_memory_manager.h" - -namespace tensorflow { -namespace { - -// Helper typedef to make ShapeTree ForEach helper lambda signatures more -// readable. They need a type of const T& where in this case T is the -// following pointer. 
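Editor's note: the key contract of the RefPtr class above is that constructing from a raw pointer transfers the caller's existing reference, while copying adds one. Below is a cut-down, self-contained illustration of those ownership rules; Counted and MiniRefPtr are illustrative stand-ins, not core::RefCounted or the deleted RefPtr.

#include <cassert>
#include <iostream>

// Minimal intrusive ref-counted object, standing in for core::RefCounted.
class Counted {
 public:
  Counted() : refs_(1) {}  // Starts with one reference, like core::RefCounted.
  void Ref() { ++refs_; }
  void Unref() {
    if (--refs_ == 0) delete this;
  }
  int refs() const { return refs_; }

 private:
  ~Counted() { std::cout << "destroyed\n"; }
  int refs_;
};

// Cut-down smart pointer with the same ownership rules as RefPtr above:
// construction from a raw pointer *transfers* a reference, copying adds one.
template <typename T>
class MiniRefPtr {
 public:
  MiniRefPtr(T* ptr = nullptr) : ptr_(ptr) {}  // Ownership transfer.
  MiniRefPtr(const MiniRefPtr& o) : ptr_(o.ptr_) { if (ptr_) ptr_->Ref(); }
  ~MiniRefPtr() { if (ptr_) ptr_->Unref(); }
  MiniRefPtr& operator=(const MiniRefPtr&) = delete;  // Kept minimal for the sketch.
  T* get() const { return ptr_; }

 private:
  T* ptr_;
};

int main() {
  MiniRefPtr<Counted> a(new Counted());  // The new object's initial reference moves to a.
  assert(a.get()->refs() == 1);
  {
    MiniRefPtr<Counted> b = a;           // Copy bumps the count.
    assert(a.get()->refs() == 2);
  }                                      // b released its reference.
  assert(a.get()->refs() == 1);
}                                        // a releases the last reference; "destroyed" is printed.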
-typedef XRTBufferAllocation* XRTBufferAllocationPtr; - -class BufferAllocStats { - public: - struct Stats { - int64_t count = 0; - int64_t size = 0; - }; - - Stats ReportAlloc(int64_t device, int64_t msize) { - mutex_lock lock(lock_); - Stats* device_stats = &stats_[device]; - device_stats->count += 1; - device_stats->size += msize; - return *device_stats; - } - - Stats ReportFree(int64_t device, int64_t msize) { - mutex_lock lock(lock_); - Stats* device_stats = &stats_[device]; - device_stats->count -= 1; - device_stats->size -= msize; - return *device_stats; - } - - private: - mutable mutex lock_; - std::map stats_; -}; - -BufferAllocStats* GetAllocStats() { - static BufferAllocStats* stats = new BufferAllocStats(); - return stats; -} - -Status AllocateScopedShapedBuffer( - XRTMemoryManager* memory_manager, xla::Backend* backend, int device_ordinal, - const xla::Shape& shape, std::unique_ptr* buffer, - se::DeviceMemoryAllocator* allocator) { - auto transfer_manager = backend->transfer_manager(); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal)); - - // XLA may use a different representation on device than the representation on - // the host. XLA does not document any contract for the relationship between - // these representations :/ Right now, the device shape is always a superset - // of the host shape, meaning that for any valid ShapeIndex in the host shape - // that ShapeIndex is also valid in the device shape, but not vice versa. In - // particular, some host-side types are rewritten to be tuples. We rely on - // this property when making sub-buffers, because we assume that if the client - // requests the host-shape sub-buffer at index i, that will correspond to the - // right device-shape sub-buffer at the same index. - xla::Shape on_device_shape = transfer_manager->HostShapeToDeviceShape(shape); - VLOG(3) << "Allocating literal buffer: host_shape=" - << xla::ShapeUtil::HumanStringWithLayout(shape) << " device_shape=" - << xla::ShapeUtil::HumanStringWithLayout(on_device_shape); - - // The ScopedShapedBuffer frees the buffers that have so far been allocated if - // it goes out of scope. That's useful if we return early as the result of an - // error allocating one of the later buffers. - *buffer = std::make_unique( - shape, on_device_shape, allocator, device_ordinal); - for (auto& index_to_buffer : (*buffer)->buffers()) { - const xla::Shape& subshape = - xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); - uint64 size = transfer_manager->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN( - se::OwningDeviceMemory buffer, - memory_manager->Allocate(backend, device_ordinal, size, allocator)); - // Move our buffer into shaped_buffer, which takes ownership of it. 
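Editor's note: the allocation loop above leans on ScopedShapedBuffer for error safety: every sub-buffer allocated so far is owned by the scoped object, so an early return after a failed later allocation cannot leak. Below is a standalone sketch of that pattern with a plain byte allocator; FakeAllocator, ScopedBufferSet, and the other names are illustrative only.

#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <optional>
#include <vector>

// Toy allocator that can be told to fail after N successful allocations.
struct FakeAllocator {
  int allocations_before_failure = 2;
  void* Allocate(std::size_t size) {
    if (allocations_before_failure-- <= 0) return nullptr;
    return std::malloc(size);
  }
  void Deallocate(void* p) { std::free(p); }
};

// Owns a set of buffers; frees them on destruction unless Release() is called.
// This mirrors the role ScopedShapedBuffer plays in the loop above.
class ScopedBufferSet {
 public:
  explicit ScopedBufferSet(FakeAllocator* allocator) : allocator_(allocator) {}
  ~ScopedBufferSet() {
    for (void* p : buffers_) allocator_->Deallocate(p);
  }
  void Add(void* p) { buffers_.push_back(p); }
  std::vector<void*> Release() { return std::move(buffers_); }  // Hand ownership to the caller.

 private:
  FakeAllocator* allocator_;
  std::vector<void*> buffers_;
};

std::optional<std::vector<void*>> AllocateAll(FakeAllocator* allocator,
                                              const std::vector<std::size_t>& sizes) {
  ScopedBufferSet scoped(allocator);
  for (std::size_t size : sizes) {
    void* p = allocator->Allocate(size);
    if (p == nullptr) return std::nullopt;  // Early return: scoped frees what was allocated.
    scoped.Add(p);
  }
  return scoped.Release();                  // Success: ownership leaves the scoped set.
}

int main() {
  FakeAllocator allocator;  // Configured to fail on the third allocation.
  auto result = AllocateAll(&allocator, {16, 32, 64});
  std::cout << (result ? "allocated all buffers\n" : "failed, nothing leaked\n");
  if (result) {
    for (void* p : *result) allocator.Deallocate(p);  // On success the caller owns the buffers.
  }
}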
- index_to_buffer.second = buffer.Release(); - VLOG(2) << "Allocated buffer at " << index_to_buffer.second.opaque() - << " index " << index_to_buffer.first.ToString() << " (" << size - << " bytes)"; - } - - TF_RETURN_IF_ERROR( - transfer_manager->WriteTupleIndexTables(stream.get(), *(buffer->get()))); - - return OkStatus(); -} - -} // namespace - -XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation, - int device_ordinal, - se::DeviceMemoryAllocator* allocator) - : allocation_(allocation), - device_ordinal_(device_ordinal), - allocator_(allocator) { - if (VLOG_IS_ON(2)) { - auto stats = - GetAllocStats()->ReportAlloc(device_ordinal_, allocation_.size()); - LOG(INFO) << "XRT Allocation Stats: device=" << device_ordinal_ - << " count=" << stats.count << " size=" << stats.size; - } -} - -XRTBufferAllocation::~XRTBufferAllocation() { - if (VLOG_IS_ON(2)) { - GetAllocStats()->ReportFree(device_ordinal_, allocation_.size()); - } - // Deallocate explicitly allows allocation_ to be null. - TF_CHECK_OK(allocator_->Deallocate(device_ordinal_, allocation_)); - VLOG(2) << "Freed buffer at " << allocation_.opaque() << " (" - << allocation_.size() << " bytes)"; -} - -const se::DeviceMemoryBase& XRTBufferAllocation::allocation() { - return allocation_; -} - -XRTTupleAllocation::XRTTupleAllocation(int device_ordinal, - se::DeviceMemoryAllocator* allocator, - const xla::Shape& on_host_shape, - const xla::Shape& on_device_shape) - : device_ordinal_(device_ordinal), - allocator_(allocator), - on_host_shape_(on_host_shape), - on_device_shape_(on_device_shape), - buffers_(&on_device_shape_), - pin_count_(0) {} - -XRTTupleAllocation::~XRTTupleAllocation() { ReleaseBuffers(); } - -void XRTTupleAllocation::ReleaseBuffers() { - for (auto& index_buffer : buffers_) { - if (index_buffer.second != nullptr) { - index_buffer.second->Unref(); - index_buffer.second = nullptr; - } - } -} - -/*static*/ Status XRTTupleAllocation::CreateAndTransfer( - const xla::LiteralBase& literal, XRTMemoryManager* memory_manager, - xla::Backend* backend, int device_ordinal, XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator) { - auto transfer_manager = backend->transfer_manager(); - std::unique_ptr scoped_buffer; - TF_RETURN_IF_ERROR(AllocateScopedShapedBuffer(memory_manager, backend, - device_ordinal, literal.shape(), - &scoped_buffer, allocator)); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal)); - TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( - stream.get(), literal, *scoped_buffer)); - - // By releasing the ScopedShapedBuffer we ensure that the underlying storage - // won't be freed when the buffer goes out of scope at the end of this - // call. To avoid a leak, there must be no error-case returns from here until - // the end of the method. 
- auto shaped_buffer = scoped_buffer->release(); - *allocation = new XRTTupleAllocation(device_ordinal, allocator, - shaped_buffer.on_host_shape(), - shaped_buffer.on_device_shape()); - (*allocation) - ->InitializeFromShapedBuffer(shaped_buffer, allocator, device_ordinal); - (*allocation)->SetDeviceMemorySize(); - return OkStatus(); -} - -/*static*/ Status XRTTupleAllocation::CreateUninitialized( - const xla::Shape& shape, XRTMemoryManager* memory_manager, - xla::Backend* backend, int device_ordinal, XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator) { - std::unique_ptr scoped_buffer; - TF_RETURN_IF_ERROR(AllocateScopedShapedBuffer(memory_manager, backend, - device_ordinal, shape, - &scoped_buffer, allocator)); - - // By releasing the ScopedShapedBuffer we ensure that the underlying storage - // won't be freed when the buffer goes out of scope at the end of this - // call. To avoid a leak, there must be no error-case returns from here until - // the end of the method. - auto shaped_buffer = scoped_buffer->release(); - *allocation = new XRTTupleAllocation(device_ordinal, allocator, - shaped_buffer.on_host_shape(), - shaped_buffer.on_device_shape()); - (*allocation) - ->InitializeFromShapedBuffer(shaped_buffer, allocator, device_ordinal); - (*allocation)->SetDeviceMemorySize(); - return OkStatus(); -} - -/*static*/ Status XRTTupleAllocation::CreateFromBuffer( - const xla::ShapedBuffer& shaped_buffer, const xla::Shape& on_host_shape, - const xla::Shape& on_device_shape, xla::Backend* backend, - int device_ordinal, XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator) { - *allocation = new XRTTupleAllocation(device_ordinal, allocator, on_host_shape, - on_device_shape); - (*allocation) - ->InitializeFromShapedBuffer(shaped_buffer, allocator, device_ordinal); - (*allocation)->SetDeviceMemorySize(); - return OkStatus(); -} - -/*static*/ Status XRTTupleAllocation::CreateFromBuffer( - const xla::ShapedBuffer& shaped_buffer, xla::Backend* backend, - int device_ordinal, XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator) { - return CreateFromBuffer(shaped_buffer, shaped_buffer.on_host_shape(), - shaped_buffer.on_device_shape(), backend, - device_ordinal, allocation, allocator); -} - -Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, - xla::MutableLiteralBase* literal) { - mutex_lock lock(lock_); - return literal_ == nullptr ? StoreToLiteral(backend, literal) - : literal->CopyFrom(*literal_); -} - -Status XRTTupleAllocation::StoreToLiteral(xla::Backend* backend, - xla::MutableLiteralBase* literal) { - auto transfer_manager = backend->transfer_manager(); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, ToShapedBuffer()); - return transfer_manager->TransferLiteralFromDevice(stream.get(), - shaped_buffer, literal); -} - -Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend, - const xla::Literal& literal) { - if (!xla::ShapeUtil::Equal(literal.shape(), on_host_shape())) { - return errors::InvalidArgument( - "New literal shape not matching the existing one: literal=", - xla::ShapeUtil::HumanStringWithLayout(literal.shape()), - " device=", xla::ShapeUtil::HumanStringWithLayout(on_host_shape())); - } - mutex_lock lock(lock_); - if (literal_ != nullptr) { - // The allocation is currently swapped out, and we have a host literal for - // its content. Just update the host literal with the new value. 
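Editor's note: WriteLiteral above distinguishes two states: the data either lives in device buffers or, after a swap-out, in a host-side literal, and pinned allocations normally refuse to be swapped. Below is a standalone sketch of that two-state swap machinery using a heap vector as the stand-in for device memory; all names are illustrative and the "swap" is a simple handover rather than the real device transfer.

#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

// Holds data either in "device" storage or, when swapped out, in a host copy.
class SwappableBuffer {
 public:
  explicit SwappableBuffer(std::vector<uint8_t> data)
      : device_(std::make_unique<std::vector<uint8_t>>(std::move(data))) {}

  void Pin() { ++pin_count_; }
  void Unpin() { --pin_count_; }
  bool IsPinned() const { return pin_count_ != 0; }
  bool IsSwappedOut() const { return host_ != nullptr; }

  // Hands the data over to the host-side copy and releases the "device" slot.
  // Pinned buffers are skipped unless swap_pinned is set, as in SwapOut above.
  bool SwapOut(bool swap_pinned = false) {
    if (IsSwappedOut() || (IsPinned() && !swap_pinned)) return false;
    host_ = std::move(device_);
    return true;
  }

  // Re-materialises the "device" buffer from the host copy.
  bool SwapIn() {
    if (!IsSwappedOut()) return false;
    device_ = std::move(host_);
    return true;
  }

 private:
  std::unique_ptr<std::vector<uint8_t>> device_;  // Present when resident.
  std::unique_ptr<std::vector<uint8_t>> host_;    // Present when swapped out.
  int64_t pin_count_ = 0;
};

int main() {
  SwappableBuffer buf({1, 2, 3});
  buf.Pin();
  std::cout << "swap while pinned: " << buf.SwapOut() << "\n";   // 0: refused.
  buf.Unpin();
  std::cout << "swap when unpinned: " << buf.SwapOut() << "\n";  // 1: swapped out.
  std::cout << "swap back in: " << buf.SwapIn() << "\n";         // 1: resident again.
}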
- return literal_->CopyFrom(literal); - } - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, ToShapedBuffer()); - auto transfer_manager = backend->transfer_manager(); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); - return transfer_manager->TransferLiteralToDevice(stream.get(), literal, - shaped_buffer); -} - -xla::StatusOr XRTTupleAllocation::SwapOut(xla::Backend* backend, - bool swap_pinned) { - mutex_lock lock(lock_); - if (literal_ == nullptr && (!IsPinned() || swap_pinned)) { - xla::Literal literal(on_host_shape()); - TF_RETURN_IF_ERROR(StoreToLiteral(backend, &literal)); - ReleaseBuffers(); - literal_ = std::make_unique(std::move(literal)); - return true; - } - return false; -} - -xla::StatusOr XRTTupleAllocation::SwapIn( - XRTMemoryManager* memory_manager, xla::Backend* backend, - se::DeviceMemoryAllocator* allocator) { - // We need to call AllocateScopedShapedBuffer() outside the locks, since the - // XRTMemoryManager might end up calling back into the SwapOut() API. - // So we do a quick check before using the IsSwapped() API, and it can happen - // that the allocation becomes swapped in after the check. This means which we - // will end up doing an allocation, and then releasing it soon after (via its - // scoped variables). This is an unlikely scenario (two threads calling - // SwapIn() on the same allocation) though. - if (!IsSwapped()) { - return false; - } - - auto transfer_manager = backend->transfer_manager(); - std::unique_ptr scoped_buffer; - TF_RETURN_IF_ERROR( - AllocateScopedShapedBuffer(memory_manager, backend, device_ordinal(), - on_host_shape(), &scoped_buffer, allocator)); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); - - mutex_lock lock(lock_); - if (literal_ != nullptr) { - TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( - stream.get(), *literal_, *scoped_buffer)); - - auto shaped_buffer = scoped_buffer->release(); - InitializeFromShapedBuffer(shaped_buffer, allocator, device_ordinal()); - literal_ = nullptr; - return true; - } - return false; -} - -xla::StatusOr XRTTupleAllocation::PinAndSwapIn( - XRTMemoryManager* memory_manager, xla::Backend* backend, - se::DeviceMemoryAllocator* allocator) { - Pin(); - return SwapIn(memory_manager, backend, allocator); -} - -bool XRTTupleAllocation::IsSwapped() const { - mutex_lock lock(lock_); - return literal_ != nullptr; -} - -int64_t XRTTupleAllocation::Pin() { return pin_count_.fetch_add(1); } - -int64_t XRTTupleAllocation::Unpin() { return pin_count_.fetch_sub(1); } - -bool XRTTupleAllocation::IsPinned() const { return pin_count_ != 0; } - -void XRTTupleAllocation::DiscardAllocation( - const xla::ShapeIndex& buffer_index) { - buffers_.element(buffer_index)->DiscardAllocation(); -} - -const xla::Shape& XRTTupleAllocation::on_host_shape() const { - return on_host_shape_; -} - -const xla::Shape& XRTTupleAllocation::on_device_shape() const { - return on_device_shape_; -} - -int XRTTupleAllocation::device_ordinal() const { return device_ordinal_; } - -const se::DeviceMemoryBase& XRTTupleAllocation::root_allocation() const { - return buffers_.element({})->allocation(); -} - -/*static*/ Status XRTTupleAllocation::MakeSubBuffer( - XRTTupleAllocation* parent, const xla::ShapeIndex& subshape, - XRTTupleAllocation** allocation, bool alias_parent_allocation) { - TF_ASSIGN_OR_RETURN( - const xla::Shape* host_sub_shape, - xla::ShapeUtil::TryGetSubshape(parent->on_host_shape(), subshape)); - TF_ASSIGN_OR_RETURN( - const xla::Shape* device_sub_shape, - 
xla::ShapeUtil::TryGetSubshape(parent->on_device_shape(), subshape)); - - *allocation = - new XRTTupleAllocation(parent->device_ordinal(), parent->allocator_, - *host_sub_shape, *device_sub_shape); - if (alias_parent_allocation) { - // Copy the subtree of allocations from the parent allocation. - (*allocation)->buffers_.CopySubtreeFrom(parent->buffers_, subshape, {}); - // Increment the refcount on each aliased buffer. - (*allocation) - ->buffers_.ForEachElement( - [](const xla::ShapeIndex& index, - const XRTBufferAllocationPtr& buffer) { buffer->Ref(); }); - } else { - // Find the buffers in the parent allocation that match the subtree, and - // move the parent allocation's buffer over to the new allocation. - (*allocation) - ->buffers_.ForEachMutableElement( - [&](const xla::ShapeIndex& index, XRTBufferAllocationPtr* buffer) { - // Extend the allocation's index to the parent's frame by adding - // subshape as a prefix. - xla::ShapeIndex parent_index = subshape; - for (int i = 0; i < index.size(); ++i) { - parent_index.push_back(index[i]); - } - *buffer = parent->buffers_.element(parent_index); - *parent->buffers_.mutable_element(parent_index) = nullptr; - }); - } - (*allocation)->SetDeviceMemorySize(); - return OkStatus(); -} - -void XRTTupleAllocation::SetDeviceMemorySize() { - size_t size = 0; - for (auto& index_buffer : buffers_) { - if (index_buffer.second != nullptr) { - size += index_buffer.second->allocation().size(); - } - } - device_memory_size_ = size; -} - -/* static */ Status XRTTupleAllocation::ExpandTreeOfTuples( - const xla::ShapeTree& elements, int device_ordinal, - se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, - xla::Shape* device_shape) { - // Initialize both host and device shape to be the 'spine' of the new tuple - // shape, given by the shape of the tree of tuples. - *host_shape = elements.shape(); - *device_shape = elements.shape(); - // Now go over the leaves of the tree of tuples, and 'graft' the host/device - // shapes of the allocation at that leaf onto the expanded host/device shapes - // at the leaf position. 
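Editor's note: MakeSubBuffer above offers two modes: aliasing, where parent and child share the buffers and each shared buffer gains a reference, and moving, where the child steals the buffers and the parent's slots are nulled out. Below is a compact sketch of the two modes using shared_ptr as the stand-in for the Ref()/Unref()-counted buffers; the flat string-indexed map and all names are illustrative.

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Stand-in for a device buffer; shared_ptr plays the role of Ref()/Unref().
using Buffer = std::shared_ptr<std::vector<char>>;
// Stand-in for a ShapeTree: maps a flattened shape index to a buffer.
using BufferTree = std::map<std::string, Buffer>;

// Aliasing mode: the child shares every matching buffer with the parent.
BufferTree AliasSubtree(const BufferTree& parent, const std::string& prefix) {
  BufferTree child;
  for (const auto& [index, buffer] : parent) {
    if (index.rfind(prefix, 0) == 0) child[index] = buffer;  // Refcount bumps.
  }
  return child;
}

// Moving mode: the child takes the buffers and the parent slots become null.
BufferTree MoveSubtree(BufferTree& parent, const std::string& prefix) {
  BufferTree child;
  for (auto& [index, buffer] : parent) {
    if (index.rfind(prefix, 0) == 0) child[index] = std::move(buffer);  // Parent slot now null.
  }
  return child;
}

int main() {
  BufferTree parent{{"0/0", std::make_shared<std::vector<char>>(16)},
                    {"0/1", std::make_shared<std::vector<char>>(32)}};
  BufferTree aliased = AliasSubtree(parent, "0/");
  std::cout << "use_count after alias: " << parent["0/0"].use_count() << "\n";            // 2
  BufferTree moved = MoveSubtree(parent, "0/");
  std::cout << "parent slot empty after move: " << (parent["0/0"] == nullptr) << "\n";    // 1
}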
- TF_RETURN_IF_ERROR(elements.ForEachElementWithStatus( - [&](const xla::ShapeIndex& index, const ExpandedTupleInput& element) { - if (elements.IsLeaf(index)) { - if (element.allocation == nullptr) { - return errors::InvalidArgument( - "MakeTuple elements has a null internal node at index ", - index.ToString()); - } - if (device_ordinal != element.allocation->device_ordinal() || - allocator != element.allocation->allocator_) { - return errors::InvalidArgument( - "MakeTuple elements must all be allocated on the same device " - "as the destination."); - } - *xla::ShapeUtil::GetMutableSubshape(host_shape, index) = - element.allocation->on_host_shape(); - *xla::ShapeUtil::GetMutableSubshape(device_shape, index) = - element.allocation->on_device_shape(); - } else { - if (element.allocation != nullptr) { - return errors::InvalidArgument( - "MakeTuple elements has a non-null internal node at index ", - index.ToString()); - } - } - return OkStatus(); - })); - return OkStatus(); -} - -/*static*/ Status XRTTupleAllocation::MakeTuple( - XRTMemoryManager* memory_manager, xla::Backend* backend, int device_ordinal, - const xla::ShapeTree& elements, - XRTTupleAllocation** allocation, se::DeviceMemoryAllocator* allocator) { - auto transfer_manager = backend->transfer_manager(); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal)); - - xla::Shape host_shape; - xla::Shape device_shape; - TF_RETURN_IF_ERROR(ExpandTreeOfTuples(elements, device_ordinal, allocator, - &host_shape, &device_shape)); - - // The aliasing is determined below based on whether or not all the inputs are - // released while being transferred. allocation_tmp is a local pointer that is - // copied to *allocation at the end only if the method succeeds. - XRTTupleAllocation* allocation_tmp = new XRTTupleAllocation( - device_ordinal, allocator, host_shape, device_shape); - core::ScopedUnref allocation_unref(allocation_tmp); - // First allocate device memory for the new tuple index tables, one at each - // internal node of the elements tree. Do this in a separate pass into a - // ScopedShapedBuffer so that it's easy to free the newly-allocated memory if - // an allocation fails. Make sure the shape has layout so that the code that - // writes index tables will be happy lower down. - xla::Shape spine_shape = elements.shape(); - xla::LayoutUtil::SetToDefaultLayout(&spine_shape); - auto new_tuple_buffers = std::make_unique( - spine_shape, spine_shape, allocator, device_ordinal); - TF_RETURN_IF_ERROR(elements.ForEachElementWithStatus( - [&](const xla::ShapeIndex& index, const ExpandedTupleInput& element) { - if (!elements.IsLeaf(index)) { - const xla::Shape& subshape = - xla::ShapeUtil::GetSubshape(device_shape, index); - uint64 size = transfer_manager->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer, - memory_manager->Allocate(backend, device_ordinal, - size, allocator)); - VLOG(2) << "Allocated buffer at " << buffer->opaque() << " index " - << index.ToString(); - // Move the new buffer into new_tuple_buffers, which takes ownership - // of it. - new_tuple_buffers->set_buffer(std::move(buffer), index); - } - return OkStatus(); - })); - // Transfer from the ScopedShapedBuffer to a ShapedBuffer, which does not own - // the newly-allocated index tables. Right now there's no owner for the new - // index tables, so next we will transfer ownership to the new allocation, - // taking care not to return early on any errors in the meantime. 
- xla::ShapedBuffer tuple_buffers = new_tuple_buffers->release(); - // Now fill in the remaining datastructures. After this ForEachElement - // completes: - // 1) Every leaf element of tuple_buffers will be the root buffer of - // an existing allocation, and every internal element of tuple_buffers - // will be a newly-allocated index table. tuple_buffers does not own any - // of these. - // 2) Every element of allocation_tmp->buffers_ will be a correctly - // constructed - // XRTBufferAllocation wrapping the necessary allocations. For buffers in - // existing allocations there will be a new reference owned by the new - // allocation, and for newly-allocated index tables there will be a - // single reference owned by the new allocation. - elements.ForEachElement([&](const xla::ShapeIndex& index, - const ExpandedTupleInput& element) { - if (elements.IsLeaf(index)) { - allocation_tmp->buffers_.CopySubtreeFrom(element.allocation->buffers_, {}, - index); - tuple_buffers.set_buffer(element.allocation->root_allocation(), index); - if (element.release_allocation_after_use) { - // Transfer the references from element's buffers to the new allocation - // rather than incrementing the refcount. The caller should have - // validated that release_allocation_after_use is false if - // element.allocation appears in more than one leaf. - element.allocation->buffers_.ForEachMutableElement( - [&](const xla::ShapeIndex&, XRTBufferAllocationPtr* buffer) { - *buffer = nullptr; - }); - } else { - // Increment the refcount on each newly-aliased buffer. - element.allocation->buffers_.ForEachElement( - [](const xla::ShapeIndex& index, - const XRTBufferAllocationPtr& buffer) { buffer->Ref(); }); - } - } else { - // This is an internal node of the tuple tree so take ownership of the - // newly-created index table. - *allocation_tmp->buffers_.mutable_element(index) = - new XRTBufferAllocation(tuple_buffers.buffer(index), device_ordinal, - allocator); - } - }); - allocation_tmp->SetDeviceMemorySize(); - // Because the internal nodes of tuple_buffers are exactly the new index - // tables, WriteTupleIndexTables will write only the new index tables and not - // rewrite the index tables for the existing allocations. - TF_RETURN_IF_ERROR( - transfer_manager->WriteTupleIndexTables(stream.get(), tuple_buffers)); - - *allocation = allocation_tmp; - // Get another reference since allocation_tmp will be Unrefed automatically on - // exit. - (*allocation)->Ref(); - return OkStatus(); -} - -bool XRTTupleAllocation::IsExclusiveOwner() const { - for (const auto& index_buffer : buffers_) { - if (index_buffer.second != nullptr && - !index_buffer.second->RefCountIsOne()) { - return false; - } - } - return true; -} - -size_t XRTTupleAllocation::GetDeviceMemorySize() const { - return device_memory_size_; -} - -void XRTTupleAllocation::InitializeFromShapedBuffer( - const xla::ShapedBuffer& shaped_buffer, - se::DeviceMemoryAllocator* allocator, int device_ordinal) { - for (auto& index_buffer : buffers_) { - if (index_buffer.second != nullptr) { - index_buffer.second->Unref(); - } - // Make a reference-counted version of the allocated buffer. 
- index_buffer.second = new XRTBufferAllocation( - shaped_buffer.buffer(index_buffer.first), device_ordinal, allocator); - } -} - -xla::StatusOr XRTTupleAllocation::ToShapedBuffer() { - xla::ShapedBuffer shaped_buffer(on_host_shape(), on_device_shape(), - device_ordinal_); - for (const auto& index_buffer : buffers_) { - if (index_buffer.second == nullptr || - (index_buffer.second->allocation().is_null() && - index_buffer.second->allocation().size() > 0)) { - return errors::InvalidArgument("Literal buffer at index ", - index_buffer.first.ToString(), - " has been released"); - } - shaped_buffer.set_buffer(index_buffer.second->allocation(), - index_buffer.first); - } - return std::move(shaped_buffer); -} - -Status XRTTupleAllocation::AliasBufferFrom(const XRTTupleAllocation& source, - const xla::ShapeIndex& source_index, - const xla::ShapeIndex& dest_index) { - XRTBufferAllocation* source_buffer = source.buffers_.element(source_index); - XRTBufferAllocation* dest_buffer = buffers_.element(dest_index); - if (dest_buffer != nullptr) { - // We allow the destination size being zero, because there are cases where - // we are coming in later filling in null/uninitialized device buffers. In - // all other cases, the size of the new buffer must match. - if (source_buffer->allocation().size() != - dest_buffer->allocation().size() && - dest_buffer->allocation().size() != 0) { - return errors::InvalidArgument( - "Source buffer at index ", source_index.ToString(), - " does not match the size of destination buffer at index ", - dest_index.ToString(), ": ", source_buffer->allocation().size(), - " vs ", dest_buffer->allocation().size()); - } - } else { - const xla::Shape& source_subshape = - xla::ShapeUtil::GetSubshape(source.on_device_shape(), source_index); - const xla::Shape& dest_subshape = - xla::ShapeUtil::GetSubshape(on_device_shape(), dest_index); - if (!xla::ShapeUtil::Equal(source_subshape, dest_subshape)) { - return errors::InvalidArgument( - "Source and destination subshapes do not match: source=", - xla::ShapeUtil::HumanStringWithLayout(source_subshape), - " dest=", xla::ShapeUtil::HumanStringWithLayout(dest_subshape)); - } - } - *buffers_.mutable_element(dest_index) = source_buffer; - source_buffer->Ref(); - if (dest_buffer != nullptr) { - // If we handed over the ownership of a buffer in ToExecutionInput(), we - // will be called here on the way back from execution, to alias back the - // buffer at that index. In that case the buffers will be the same. So we - // need to discard the memory at the destination buffer, before releasing - // the reference. 
- if (dest_buffer->allocation().IsSameAs(source_buffer->allocation()) && - dest_buffer != source_buffer) { - dest_buffer->DiscardAllocation(); - } - dest_buffer->Unref(); - } - return OkStatus(); -} - -xla::StatusOr XRTTupleAllocation::ToExecutionInput( - const std::function(const xla::ShapeIndex&)>& - alias_checker) { - xla::ExecutionInput result(on_device_shape(), on_host_shape()); - for (const auto& index_buffer : buffers_) { - if (index_buffer.second == nullptr || - (index_buffer.second->allocation().is_null() && - index_buffer.second->allocation().size() > 0)) { - return errors::InvalidArgument("Literal buffer at index ", - index_buffer.first.ToString(), - " has been released"); - } - TF_ASSIGN_OR_RETURN(bool should_alias, alias_checker(index_buffer.first)); - if (!should_alias) { - result.SetBuffer( - index_buffer.first, - xla::MaybeOwningDeviceMemory(index_buffer.second->allocation())); - } else { - // We keep the ownership of the device memory here. - result.SetUnownedBuffer( - index_buffer.first, - xla::MaybeOwningDeviceMemory(se::OwningDeviceMemory( - index_buffer.second->allocation(), device_ordinal_, allocator_))); - } - } - return std::move(result); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h deleted file mode 100644 index 679071f27eb199..00000000000000 --- a/tensorflow/compiler/xrt/xrt_state.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for keeping track of on-device state. - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_STATE_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_STATE_H_ - -#include -#include -#include -#include -#include - -#include "xla/literal.h" -#include "xla/service/backend.h" -#include "xla/service/executable.h" -#include "xla/service/shaped_buffer.h" -#include "xla/shape_util.h" -#include "xla/statusor.h" -#include "xla/stream_executor/device_memory_allocator.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/xla_data.pb.h" -#include "tensorflow/compiler/xrt/xrt_refptr.h" -#include "tensorflow/core/lib/core/refcount.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { - -// Cannot include xrt_memory_manager.h here, as it needs to include this file. -class XRTMemoryManager; - -// TODO(misard) make this a Tensor if and when that makes sense. -// A reference-counted wrapper around a buffer allocation. This maps an XLA -// tuple index or a non-tuple XLA shape to a region of device memory. The device -// memory buffer is freed when the reference count drops to zero. 
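Editor's note: ToExecutionInput above wraps each buffer as maybe-owning: aliased inputs donate ownership to the execution, while the rest are passed as borrowed references. Below is a standalone sketch of that owned-versus-borrowed distinction built on std::variant; MaybeOwningMemory and the other names are illustrative, not the xla types used above.

#include <iostream>
#include <memory>
#include <variant>
#include <vector>

using DeviceMemory = std::vector<char>;

// Either owns the memory (and will free it) or merely borrows it.
class MaybeOwningMemory {
 public:
  static MaybeOwningMemory Owned(std::unique_ptr<DeviceMemory> mem) {
    return MaybeOwningMemory(std::move(mem));
  }
  static MaybeOwningMemory Borrowed(DeviceMemory* mem) {
    return MaybeOwningMemory(mem);
  }
  bool owns() const {
    return std::holds_alternative<std::unique_ptr<DeviceMemory>>(mem_);
  }
  DeviceMemory* get() {
    return owns() ? std::get<std::unique_ptr<DeviceMemory>>(mem_).get()
                  : std::get<DeviceMemory*>(mem_);
  }

 private:
  explicit MaybeOwningMemory(std::unique_ptr<DeviceMemory> mem) : mem_(std::move(mem)) {}
  explicit MaybeOwningMemory(DeviceMemory* mem) : mem_(mem) {}
  std::variant<std::unique_ptr<DeviceMemory>, DeviceMemory*> mem_;
};

int main() {
  DeviceMemory resident(64);                       // Stays owned by the allocation.
  auto donated = std::make_unique<DeviceMemory>(128);

  MaybeOwningMemory borrowed = MaybeOwningMemory::Borrowed(&resident);
  MaybeOwningMemory owned = MaybeOwningMemory::Owned(std::move(donated));

  std::cout << "borrowed owns: " << borrowed.owns() << "\n";  // 0
  std::cout << "donated owns:  " << owned.owns() << "\n";     // 1
}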
-class XRTBufferAllocation : public core::RefCounted { - public: - XRTBufferAllocation(const se::DeviceMemoryBase& allocation, - int device_ordinal, se::DeviceMemoryAllocator* allocator); - ~XRTBufferAllocation() override; - - // The region of device memory being wrapped. - const se::DeviceMemoryBase& allocation(); - - void DiscardAllocation() { allocation_ = se::DeviceMemoryBase(); } - - private: - se::DeviceMemoryBase allocation_; - int device_ordinal_; - se::DeviceMemoryAllocator* allocator_; -}; - -// A XRTTupleAllocation represents an allocated memory area on the device. -// New tuples can be created in three ways: by passing a literal in which case -// device memory is allocated and the literal is transferred to that memory; by -// aliasing a sub-shape of an existing tuple-shaped handle; or by aliasing a -// vector of existing handles to create a new tuple. The underlying storage is -// reference-counted. When a handle is released, the reference count of each -// storage buffer is decremented, and buffers with no outstanding references are -// freed. -class XRTTupleAllocation : public core::RefCounted { - public: - ~XRTTupleAllocation() override; - - // Allocates new device memory buffers sufficient to store literal, transfers - // literal to that memory, and returns a XRTTupleAllocation handle to the - // allocated buffers. - static Status CreateAndTransfer(const xla::LiteralBase& literal, - XRTMemoryManager* memory_manager, - xla::Backend* backend, int device_ordinal, - XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator); - - // Allocates new device memory buffers sufficient to store a tensor of - // the specified shape, and returns a XRTTupleAllocation handle to the - // allocated buffers. The allocated buffers are not initialized. - static Status CreateUninitialized(const xla::Shape& shape, - XRTMemoryManager* memory_manager, - xla::Backend* backend, int device_ordinal, - XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator); - - // Wraps an existing ShapeBuffer in a new XRTTupleAllocation handle. - static Status CreateFromBuffer(const xla::ShapedBuffer& shaped_buffer, - xla::Backend* backend, int device_ordinal, - XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator); - - // Same as the CreateFromBuffer() API above, but with the shapes being passed - // as input. This API is used when creating tuple allocations with the output - // of XLA computations which emit dynamic shaped output via the output shape - // table. - static Status CreateFromBuffer(const xla::ShapedBuffer& shaped_buffer, - const xla::Shape& on_host_shape, - const xla::Shape& on_device_shape, - xla::Backend* backend, int device_ordinal, - XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator); - - // Aliases a sub-shape of parent and returns a XRTTupleAllocation handle - // to the sub-shape. If alias_base_allocation is true, the buffers in the - // sub-shape will be shared between parent and the returned allocation, - // otherwise the overlapping buffers in parent will be replaced by - // nullptr. - static Status MakeSubBuffer(XRTTupleAllocation* parent, - const xla::ShapeIndex& subshape, - XRTTupleAllocation** allocation, - bool alias_parent_allocation); - - // A structure describing a leaf of a tree of tuples to expand. Each leaf - // contains an allocation and indicates whether or not the allocation's handle - // should be freed after incorporating its buffers into the expanded tree. 
- struct ExpandedTupleInput { - RefPtr allocation; - bool release_allocation_after_use; - }; - - // Returns a handle to a new tuple where the subtree of the new tuple at an - // index corresponding to a leaf of 'elements' is constructed from the - // allocation (i.e., a tuple or array) pointed to by that leaf. If - // release_allocation_after_use is false at a leaf, the new tuple will alias - // the input allocation at that leaf, otherwise the input allocation will be - // released. Input allocations may be repeated (appear in more than one leaf) - // in which case the corresponding buffers in the output tuple will alias. If - // an input is repeated, release_input_handle must be false for every leaf - // where that input appears. The latter property is not validated by MakeTuple - // and must be enforced by the caller. - static Status MakeTuple(XRTMemoryManager* memory_manager, - xla::Backend* backend, int device_ordinal, - const xla::ShapeTree& elements, - XRTTupleAllocation** allocation, - se::DeviceMemoryAllocator* allocator); - - // Copies the allocation from device to host and returns it in literal. - Status ToLiteral(xla::Backend* backend, xla::MutableLiteralBase* literal); - - // Write a new literal value to the allocation. - Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal); - - // Stores the content of the tuple allocation into the internal literal, and - // releases all the device buffers. The swap_pinned flag tells whether a - // pinned allocation should be swapped out. It should be false on all cases, - // but during the memory compaction operation from the XRTMemoryManager. - // Returns a boolean telling whether the allocation was swapped out. - xla::StatusOr SwapOut(xla::Backend* backend, bool swap_pinned); - - // Allocates the device memory required to store the tuple value held within - // the internal literal, and transfer the literal value into the device - // memory. Returns a boolean telling whether the allocation was swapped in. - xla::StatusOr SwapIn(XRTMemoryManager* memory_manager, - xla::Backend* backend, - se::DeviceMemoryAllocator* allocator); - - // Pins the allocation first, then swap it in (if it is not already). After - // this API returns, the allocation is pinned and its content on device - // memory. The caller is responsible for releasing the pin-count using the - // Unpin() API. - xla::StatusOr PinAndSwapIn(XRTMemoryManager* memory_manager, - xla::Backend* backend, - se::DeviceMemoryAllocator* allocator); - - // Checks whether the allocation is currently swapped out. - bool IsSwapped() const; - - // Increases the pin-count of this allocation. If the pin-count is greater - // than 0, the allocation cannot be swapped. Returned the pin-count value - // before the increase. - int64_t Pin(); - - // Decreases the pin-count of this allocation. Returned the pin-count value - // before the decrease. - int64_t Unpin(); - - // Checks whether the allocation is currently pinned. - bool IsPinned() const; - - // True if none of the buffers in the allocation are aliased by any other live - // handle. - bool IsExclusiveOwner() const; - - // Retrieves the footprint in terms of device memory, of this allocation. - size_t GetDeviceMemorySize() const; - - // The ordinal of the device holding this tuple. - int device_ordinal() const; - - // Returns the shape of the tuple as seen by the host. - const xla::Shape& on_host_shape() const; - - // Returns the shape of the tuple as stored on the device. 
- const xla::Shape& on_device_shape() const; - - // Returns the buffer pointed to by the root of the tuple. - const se::DeviceMemoryBase& root_allocation() const; - - // Stops managing the storage for the allocation at buffer_index, e.g., - // because it has been aliased to the output buffer of a computation. - void DiscardAllocation(const xla::ShapeIndex& buffer_index); - - // Returns the tree of allocations as a ShapedBuffer. This tree may not have - // the same shape as on_host_shape. - xla::StatusOr ToShapedBuffer(); - - // Aliases the source buffer at source_index into the current tuple allocation - // dest_index. - Status AliasBufferFrom(const XRTTupleAllocation& source, - const xla::ShapeIndex& source_index, - const xla::ShapeIndex& dest_index); - - // Returns the device memory tree of this allocation. If the alias_checker - // function returns true for a given index, an owned device memory is returned - // to the caller. But the tuple allocation cannot release the ownership in - // full, as the execute operation might fail. So we rely on a call to - // AliasBufferFrom() to re-alias back the buffers. This is not great (to say - // the least), but the current aliasing logic relies on - // MaybeOwningDeviceMemory being owned, to detect the fact that the user may - // want to alias a buffer. Unfortunately to do that, it needs to release the - // ownership, which is a problem if the execute will fail. - // This calls for a refactoring of the whole owning/maybe-owning interface to - // introduce a sharing concept (IOW shared_ptr model vs. unique_ptr). - // We'd need something similar to XRTTupleAllocation instead of - // ScopedShapedBuffer, which wants ownership and does not allow sharing. - xla::StatusOr ToExecutionInput( - const std::function(const xla::ShapeIndex&)>& - alias_checker); - - private: - // Creates a new handle with (tuple) shape. - XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator, - const xla::Shape& on_host_shape, - const xla::Shape& on_device_shape); - - // Inherits the allocations represented in buffer, which must have the same - // shape as buffers_. - void InitializeFromShapedBuffer(const xla::ShapedBuffer& shaped_buffer, - se::DeviceMemoryAllocator* allocator, - int device_ordinal); - - // Releases all the XRTBufferAllocation buffer references and set the - // corresponding shape tree entry to nullptr. - void ReleaseBuffers(); - - // Stores the content of the allocation from device memory to the target host - // literal. - Status StoreToLiteral(xla::Backend* backend, - xla::MutableLiteralBase* literal); - - // Sets the total size of the buffers held within this allocation buffers. - // This API should be called once when an XRTTupleAllocation object is - // created, as the XRTTupleAllocation shapes never change, and hence the - // device memory size. - void SetDeviceMemorySize(); - - // Takes a tree 'elements' where each leaf is an allocation, validates that - // they are all on device_ordinal managed by allocator, and returns in - // host_shape and device_shape the host/device shapes of the expanded tree, - // where at each leaf of elements the shape of the allocation at elements is - // grafted on. - static Status ExpandTreeOfTuples( - const xla::ShapeTree& elements, int device_ordinal, - se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, - xla::Shape* device_shape); - - // The lock which protects the internal operations of the tuple allocation. Is - // mutable to allow const-like operations to be declared as such. 
- mutable mutex lock_; - - // Location of the memory that is being managed. - const int device_ordinal_; - se::DeviceMemoryAllocator* const allocator_; - - // The shape that the caller thinks the tuple has. - const xla::Shape on_host_shape_; - // The shape that the tuple has on device. Store this explicitly instead of - // using a shape stored in ShapeTree because ShapeTree discards the layout. - const xla::Shape on_device_shape_; - // The tree of reference-counted buffers, which uses on_device_shape_ as its - // shape. - xla::ShapeTree buffers_; - // The footprint of the allocation, when residing on device memory. - size_t device_memory_size_ = 0; - // If the allocation is swapped out, this is the literal storing its content. - std::unique_ptr literal_; - // A pinned allocation is one which cannot be swapped out. If pin_count_ > 0 - // then the allocation is pinned. - std::atomic pin_count_; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_STATE_H_ diff --git a/tensorflow/compiler/xrt/xrt_tpu_device.cc b/tensorflow/compiler/xrt/xrt_tpu_device.cc deleted file mode 100644 index b747c5505e7aa1..00000000000000 --- a/tensorflow/compiler/xrt/xrt_tpu_device.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xrt/xrt_tpu_device.h" - -#include "tensorflow/compiler/jit/xla_device.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/tpu/tpu_configuration.h" - -namespace tensorflow { - -/*static*/ Status XRTTpuDeviceAccessor::GetResourceManager(OpKernelContext* ctx, - ResourceMgr** rm) { - // ctx is unused here, but maintained because XRTGenericDeviceAccessor uses - // it in its GetResourceManager. 
- *rm = GetTPUConfigResourceMgr(); - if (*rm == nullptr) { - return errors::Internal("No Tpu resource manager."); - } - return OkStatus(); -} - -Status XRTTpuDeviceAccessor::ScopedRef::Acquire(int device_ordinal) { - TF_ASSIGN_OR_RETURN(node_context_, - tpu::TpuNodeContext::Create(device_ordinal)); - ordinal_ = device_ordinal; - return OkStatus(); -} - -Status XRTTpuDeviceAccessor::ScopedRef::Acquire(OpKernelContext* ctx) { - const XlaDevice::Metadata* metadata; - TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); - return Acquire(metadata->device_ordinal()); -} - -/*static*/ Status XRTTpuDeviceAccessor::InitScopedRef( - OpKernelContext* /*unused ctx*/, int device_ordinal, - ScopedRef* scoped_ref) { - return scoped_ref->Acquire(device_ordinal); -} - -/*static*/ Status XRTTpuDeviceAccessor::InitScopedRef(OpKernelContext* ctx, - ScopedRef* scoped_ref) { - return scoped_ref->Acquire(ctx); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_tpu_device.h b/tensorflow/compiler/xrt/xrt_tpu_device.h deleted file mode 100644 index c2251e76be8f42..00000000000000 --- a/tensorflow/compiler/xrt/xrt_tpu_device.h +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Classes for keeping track of on-device state for TPUs. - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_TPU_DEVICE_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_TPU_DEVICE_H_ - -#include - -#include "xla/client/local_client.h" -#include "xla/stream_executor/tpu/tpu_node_context.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" - -namespace tensorflow { - -// This accessor is used for XLA TPU. It uses the distributed TPU compilation -// cache infrastructure which it accesses via the TPU_SYSTEM resource manager. -class XRTTpuDeviceAccessor { - public: - static Status GetResourceManager(OpKernelContext* ctx, ResourceMgr** rm); - - class ScopedRef { - public: - ScopedRef() = default; - ~ScopedRef() = default; - - ScopedRef(const ScopedRef&) = delete; - ScopedRef& operator=(const ScopedRef&) = delete; - - // Returns the XLA device properties from the TpuNodeContext object - // protected by this ScopedRef. - xla::Backend* backend() { return node_context_->backend(); } - int device_ordinal() { return ordinal_; } - se::DeviceMemoryAllocator* allocator() { - return backend()->memory_allocator(); - } - - private: - // XRTTpuDeviceAccessor::InitScopedRef is the only way to initialize - // ScopedRef. 
- friend class XRTTpuDeviceAccessor; - - Status Acquire(int device_ordinal); - - Status Acquire(OpKernelContext* ctx); - - std::unique_ptr node_context_; - int ordinal_ = 0; - }; - - static Status InitScopedRef(OpKernelContext* ctx, int device_ordinal, - ScopedRef* scoped_ref); - - static Status InitScopedRef(OpKernelContext* ctx, ScopedRef* scoped_ref); -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_TPU_DEVICE_H_ diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc deleted file mode 100644 index 5f1df1ff6dc0eb..00000000000000 --- a/tensorflow/compiler/xrt/xrt_util.cc +++ /dev/null @@ -1,450 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xrt/xrt_util.h" - -#include -#include - -#include -#include -#include -#include - -#include "xla/debug_options_flags.h" -#include "xla/types.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mutex.h" - -namespace tensorflow { -namespace { - -mutex nccl_factory_mutex(LINKER_INITIALIZED); -std::shared_ptr* nccl_factory; - -// The ScopedHandles data structure is used in the ExecuteChained() API and its -// task is to track tuple allocation registrations. It is used both the track -// intermediate results of a chained computation, or its final results. Anything -// which is marked to be released, will be released using the XRTMemoryManager -// once the object is destroyed (unless an explicit call to Drop() or Release() -// is made). -class ScopedHandles { - public: - explicit ScopedHandles(RefPtr memory_manager) - : memory_manager_(std::move(memory_manager)) {} - - ~ScopedHandles() { - for (size_t i = 0; i < handles_.size(); ++i) { - if (handles_release_[i]) { - memory_manager_->Release(handles_[i]).IgnoreError(); - } - } - } - - int64_t operator[](size_t index) const { return handles_.at(index); } - - size_t size() const { return handles_.size(); } - - // Adds the given handle at the index position, by marking it releasable - // according to the release argument. If an existing, and to-be-released - // handle already exists at the same index, it will be released. - Status Add(size_t index, int64_t handle, bool release) { - if (index >= handles_.size()) { - handles_.resize(index + 1, XRTMemoryManager::InvalidKey()); - handles_release_.resize(index + 1, false); - } - if (handles_release_[index]) { - Status status = memory_manager_->Release(handles_[index]); - if (!status.ok()) { - if (release) { - memory_manager_->Release(handle).IgnoreError(); - } - return status; - } - } - handles_[index] = handle; - handles_release_[index] = release; - return OkStatus(); - } - - // Adds a to-be-released tuple allocation at the given index. 
- Status Add(size_t index, RefPtr tuple) { - return Add(index, memory_manager_->Register(std::move(tuple)), - /*release=*/true); - } - - // Drops the handle at the given index, and releases it using the - // XRTMemoryManager::Release() if marked as to-be-released. - Status Drop(size_t index) { - if (handles_release_.at(index)) { - TF_RETURN_IF_ERROR(memory_manager_->Release(handles_[index])); - } - Release(index); - return OkStatus(); - } - - // Releases the handle at the given index. The destructor will not use that - // XRTMemoryManager::Release() API on such handle. - int64_t Release(size_t index) { - int64_t handle = handles_.at(index); - handles_[index] = XRTMemoryManager::InvalidKey(); - handles_release_[index] = false; - return handle; - } - - // Looks up the handle stored at the given index, and returns the matching - // tuple allocation. - xla::StatusOr> Lookup(size_t index) const { - return memory_manager_->Lookup(handles_.at(index)); - } - - private: - RefPtr memory_manager_; - std::vector handles_; - std::vector handles_release_; -}; - -bool DebugOptionsPassThroughEnabled() { - const char* env = getenv("TF_XLA_DEBUG_OPTIONS_PASSTHROUGH"); - bool enabled = - env != nullptr && (strcmp(env, "1") == 0 || strcmp(env, "true") == 0); - if (enabled) { - LOG(WARNING) << "Passing through XLA debug options!"; - } else { - LOG(WARNING) << "TF_XLA_DEBUG_OPTIONS_PASSTHROUGH not set, not all options " - "will be retained"; - } - return enabled; -} - -string SafeDebugPath(const string& path) { - if (path.empty() || path.compare(0, 5, "gs://") == 0 || - path.compare(0, 11, "bigstore://") == 0) { - return path; - } - LOG(WARNING) << "Invalid config path (will be dropped): " << path; - return string(); -} - -Status MakeOutput(const RefPtr& output, int64_t index, - RefPtr* result) { - if (index == 0) { - *result = output; - } else { - XRTTupleAllocation* tuple; - TF_RETURN_IF_ERROR( - XRTTupleAllocation::MakeSubBuffer(output.get(), {index - 1}, &tuple, - /*alias_parent_allocation=*/true)); - result->reset(tuple); - } - return OkStatus(); -} - -Status PopulateOpWorkingSet(xla::Backend* backend, - const xrt::XRTChainedExecuteOp& op, - int current_index, const ScopedHandles& outputs, - XRTMemoryManager::WorkingSet* working_set, - se::DeviceMemoryAllocator* allocator) { - for (int i = 0; i < op.inputs_size(); ++i) { - auto& input = op.inputs(i); - if (input.op_index() >= current_index) { - return errors::InvalidArgument( - "Input index ", input.op_index(), - " is above the current position: ", current_index); - } - TF_RETURN_IF_ERROR(working_set->LookupAndPin( - backend, outputs[input.op_index()], allocator)); - } - return OkStatus(); -} - -} // namespace - -void SetNcclUniqueIdFactory(std::shared_ptr factory) { - mutex_lock lock(nccl_factory_mutex); - if (nccl_factory == nullptr) { - nccl_factory = new std::shared_ptr(); - } - *nccl_factory = std::move(factory); -} - -std::shared_ptr GetNcclUniqueIdFactory() { - mutex_lock lock(nccl_factory_mutex); - return nccl_factory != nullptr ? 
*nccl_factory : nullptr; -} - -xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) { - static const bool options_passthrough = DebugOptionsPassThroughEnabled(); - if (options_passthrough) { - return ref_options; - } - xla::DebugOptions options = xla::GetDebugOptionsFromFlags(); - options.set_xla_dump_to(SafeDebugPath(ref_options.xla_dump_to())); - options.set_xla_dump_hlo_as_proto(ref_options.xla_dump_hlo_as_proto()); - options.set_xla_dump_hlo_as_text(ref_options.xla_dump_hlo_as_text()); - options.set_xla_dump_hlo_snapshots(ref_options.xla_dump_hlo_snapshots()); - options.set_xla_dump_hlo_pass_re(ref_options.xla_dump_hlo_pass_re()); - options.set_xla_dump_include_timestamp( - ref_options.xla_dump_include_timestamp()); - options.set_xla_dump_max_hlo_modules(ref_options.xla_dump_max_hlo_modules()); - options.set_xla_dump_enable_mlir_pretty_form( - ref_options.xla_dump_enable_mlir_pretty_form()); - - for (auto& pass : ref_options.xla_disable_hlo_passes()) { - options.add_xla_disable_hlo_passes(pass); - } - return options; -} - -xla::StatusOr> GetComputationInputs( - OpKernelContext* context, const char* input_name) { - OpInputList arg_list; - TF_RETURN_IF_ERROR(context->input_list(input_name, &arg_list)); - // Concatenate all input uids from list of scalars-or-vectors carrying them. - std::vector input_coords; - for (int i = 0; i < arg_list.size(); ++i) { - const Tensor& arg = arg_list[i]; - if (TensorShapeUtils::IsScalar(arg.shape())) { - input_coords.emplace_back(arg.scalar()()); - } else { - TF_RET_CHECK(TensorShapeUtils::IsVector(arg.shape())); - auto arg_vec = arg.vec(); - const int64_t num_elts = arg.shape().dim_size(0); - for (int i = 0; i < num_elts; ++i) { - input_coords.emplace_back(arg_vec(i)); - } - } - } - return std::move(input_coords); -} - -bool InputShapeMatches(const xla::Shape& parameter_shape, - const xla::Shape& input_shape) { - auto shape_checker = [&](const xla::Shape& pshape, - const xla::ShapeIndex& index) { - if (pshape.IsArray()) { - TF_ASSIGN_OR_RETURN(const xla::Shape* ishape, - xla::ShapeUtil::TryGetSubshape(input_shape, index)); - if (pshape.rank() != ishape->rank() || - pshape.element_type() != ishape->element_type()) { - return errors::InvalidArgument("Mismatching shapes"); - } - if (pshape.is_static() && !xla::Layout::Equal().IgnoreTiles()( - pshape.layout(), ishape->layout())) { - return errors::InvalidArgument("Mismatching layouts"); - } - for (int64_t dim = 0; dim < pshape.rank(); ++dim) { - if (pshape.is_dynamic_dimension(dim)) { - if (pshape.dimensions(dim) < ishape->dimensions(dim)) { - return errors::InvalidArgument("Mismatching shapes"); - } - } else if (pshape.dimensions(dim) != ishape->dimensions(dim)) { - return errors::InvalidArgument("Mismatching shapes"); - } - } - } - return OkStatus(); - }; - return xla::ShapeUtil::ForEachSubshapeWithStatus(parameter_shape, - shape_checker) - .ok(); -} - -xla::StatusOr>> GetInputTupleAllocations( - const std::vector& input_coords, - XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, - int64_t num_input_shapes, - const std::function& shape_getter, bool release_inputs, - se::DeviceMemoryAllocator* allocator) { - if (input_coords.size() != num_input_shapes) { - return errors::InvalidArgument( - "Number of inputs does not match executable proto input shapes: ", - input_coords.size(), " vs. 
", num_input_shapes); - } - std::vector> input_tuples; - input_tuples.reserve(input_coords.size()); - for (size_t i = 0; i < input_coords.size(); ++i) { - TF_RETURN_IF_ERROR( - working_set->LookupAndPin(backend, input_coords[i].handle, allocator)); - auto tuple = working_set->PinnedTuples().back(); - if (release_inputs) { - // We are holding a reference to the tuple, so we can safely delete it - // from the resource manager here. - TF_RETURN_IF_ERROR( - working_set->MemoryManager()->Release(input_coords[i].handle)); - VLOG(2) << "Released allocation handle " << input_coords[i].handle; - } - xla::Shape input_shape = shape_getter(i); - if (!InputShapeMatches(input_shape, tuple->on_host_shape())) { - return errors::InvalidArgument( - "Run-time shape mismatch for XRTExecute argument[", i, "] (", - input_coords[i].handle, "). Expected ", input_shape.DebugString(), - "; got ", tuple->on_host_shape().DebugString()); - } - if (input_coords[i].index.empty()) { - input_tuples.emplace_back(std::move(tuple)); - } else { - XRTTupleAllocation* sub_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( - tuple.get(), input_coords[i].index, &sub_tuple, - /*alias_parent_allocation=*/true)); - input_tuples.emplace_back(sub_tuple); - } - } - return std::move(input_tuples); -} - -Status RebuildOutputAliases( - const RefPtr& output_tuple, - absl::Span> input_tuples, - const xla::HloInputOutputAliasConfig& input_output_alias) { - auto alias_function = - [&](const xla::ShapeIndex& output_index, - const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { - TF_RET_CHECK(alias.parameter_number < input_tuples.size()); - return output_tuple->AliasBufferFrom(*input_tuples[alias.parameter_number], - alias.parameter_index, output_index); - }; - return input_output_alias.ForEachAliasWithStatus(alias_function); -} - -xla::StatusOr> GetArgumentsBuffers( - const xla::HloInputOutputAliasConfig& input_output_alias, - absl::Span> input_tuples, - const std::vector& input_is_dynamic, bool release_inputs) { - auto is_dynamic = [&](size_t arg) { - return arg < input_is_dynamic.size() && input_is_dynamic[arg]; - }; - std::vector arguments; - // Don't alias dynamic input -- Due to the underlying implementation, - // aliased inputs have two owners: XRTAllocation and return value of - // this function. If an argument is dynamic and the ownership is - // released to output of this function, TPUExecute will free it and - // reallocate a new one, which creates a double freeing issue where - // XRTAllocation also attempts to release the buffer. 
- bool alias_outputs = release_inputs && input_tuples.size() == 1 && - input_tuples[0]->IsExclusiveOwner() && !is_dynamic(0); - arguments.reserve(input_tuples.size()); - for (int64_t i = 0; i < input_tuples.size(); ++i) { - auto alias_checker = - [&](const xla::ShapeIndex& index) -> xla::StatusOr { - if (input_output_alias.ParameterHasAlias(i, index)) { - TF_RET_CHECK(!is_dynamic(i)); - return true; - } - return alias_outputs; - }; - TF_ASSIGN_OR_RETURN(xla::ExecutionInput exec_input, - input_tuples[i]->ToExecutionInput(alias_checker)); - arguments.emplace_back(std::move(exec_input)); - } - return std::move(arguments); -} - -Status CreateExecuteOutput(OpKernelContext* context, - XRTMemoryManager* memory_manager, - RefPtr output_tuple, - bool return_exploded_tuple) { - if (return_exploded_tuple && output_tuple->on_host_shape().IsTuple()) { - int64_t tuple_element_count = - xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape()); - Tensor* output_tensor; - TF_RETURN_IF_ERROR(context->allocate_output( - 0, TensorShape({tuple_element_count}), &output_tensor)); - - for (int64_t i = 0; i < tuple_element_count; ++i) { - XRTTupleAllocation* suballocation; - TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( - output_tuple.get(), {i}, &suballocation, - /*alias_parent_allocation=*/false)); - output_tensor->vec()(i) = - memory_manager->Register(suballocation); - } - } else { - Tensor* output_tensor; - TF_RETURN_IF_ERROR( - context->allocate_output(0, TensorShape({}), &output_tensor)); - output_tensor->scalar()() = - memory_manager->Register(std::move(output_tuple)); - } - return OkStatus(); -} - -Status ExecuteChained(OpKernelContext* context, - const RefPtr& memory_manager, - xla::Backend* backend, int device_ordinal, - const xrt::XRTChainedExecutePlan& plan, - const xrt::XRTChainedExecuteConfig& config, - const ChainedExecuteFn& execute_op, - se::DeviceMemoryAllocator* allocator) { - // Create the vector which tracks the uses of the intermediate chained - // operations outputs. - std::vector uses(plan.ops_size(), 0); - for (auto& op : plan.ops()) { - for (auto& input : op.inputs()) { - uses[input.op_index()] += 1; - } - } - - ScopedHandles outputs(memory_manager); - ScopedHandles results(memory_manager); - for (int i = 0; i < plan.ops_size(); ++i) { - auto& op = plan.ops(i); - if (op.op_oneof_case() == xrt::XRTChainedExecuteOp::kDataHandle) { - // This operation is a device data load. Set the handle as output and - // leave the release flag off, since this is not an intermediate output. - TF_RETURN_IF_ERROR(outputs.Add(i, op.data_handle(), /*release=*/false)); - } else if (op.op_oneof_case() == - xrt::XRTChainedExecuteOp::kComputationHandle) { - // This is an XRT execute operation, forward to the device specific - // handler. Populating the working set makes sure the input allocations - // for this execute operations are pinned to device memory. - XRTMemoryManager::WorkingSet working_set(memory_manager); - TF_RETURN_IF_ERROR(PopulateOpWorkingSet(backend, op, i, outputs, - &working_set, allocator)); - TF_ASSIGN_OR_RETURN(auto tuple, - execute_op(op, working_set.PinnedTuples())); - TF_RETURN_IF_ERROR(outputs.Add(i, std::move(tuple))); - } else { - return errors::InvalidArgument( - "Undefined operation kind at post-order position ", i); - } - // If the result of this chained operation is an output result, feed the - // results at the desired position. 
- for (auto& output : op.outputs()) { - TF_ASSIGN_OR_RETURN(auto tuple, outputs.Lookup(i)); - RefPtr result; - TF_RETURN_IF_ERROR(MakeOutput(tuple, output.output_index(), &result)); - TF_RETURN_IF_ERROR(results.Add(output.result_index(), std::move(result))); - } - // Drop intermediate results which have no more users. - for (auto& input : op.inputs()) { - uses[input.op_index()] -= 1; - if (uses[input.op_index()] == 0) { - TF_RETURN_IF_ERROR(outputs.Drop(input.op_index())); - } - } - } - - Tensor* output_tensor; - TF_RETURN_IF_ERROR(context->allocate_output( - 0, TensorShape({static_cast(results.size())}), &output_tensor)); - for (size_t i = 0; i < results.size(); ++i) { - output_tensor->vec()(i) = results.Release(i); - } - return OkStatus(); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h deleted file mode 100644 index a9f68d676efa6b..00000000000000 --- a/tensorflow/compiler/xrt/xrt_util.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Utility functions in support of the XRT API. - -#ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ -#define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ - -#include -#include -#include -#include -#include - -#include "xla/hlo/ir/hlo_input_output_alias_config.h" -#include "xla/service/backend.h" -#include "xla/shape.h" -#include "xla/shape_util.h" -#include "xla/statusor.h" -#include "xla/xla.pb.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" -#include "tensorflow/compiler/xrt/xrt_memory_manager.h" -#include "tensorflow/compiler/xrt/xrt_refptr.h" -#include "tensorflow/compiler/xrt/xrt_state.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { - -// Factory class which creates NCCL unique IDs based on the replicas -// participating to a given communication. This is only used for GPU backends. -struct NcclUniqueIdFactory { - virtual ~NcclUniqueIdFactory() = default; - - // Generates the NCCL unique ID for the given set of replica IDs. - virtual std::string GetUniqueId(absl::Span replicas) = 0; -}; - -void SetNcclUniqueIdFactory(std::shared_ptr factory); - -std::shared_ptr GetNcclUniqueIdFactory(); - -struct InputCoords { - explicit InputCoords(int64_t handle) : handle(handle) {} - InputCoords(int64_t handle, xla::ShapeIndex index) - : handle(handle), index(std::move(index)) {} - - int64_t handle = 0; - xla::ShapeIndex index; -}; - -// Filters the debug options provided as argument according to the value of the -// TF_XLA_DEBUG_OPTIONS_PASSTHROUGH environment variable. If such variable is -// set to "1" or "true", the debug options will be returned as is. Otherwise -// only a subset of them will be set in the returned ones, and all the paths -// contained in it, will be limited to gs:// and bigstore:// ones. 
-xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options); - -// Populates the input_coords with a list of input coordinates from a input_name -// op argument. -xla::StatusOr> GetComputationInputs( - OpKernelContext* context, const char* input_name); - -bool InputShapeMatches(const xla::Shape& parameter_shape, - const xla::Shape& input_shape); - -xla::StatusOr>> GetInputTupleAllocations( - const std::vector& input_coords, - XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, - int64_t num_input_shapes, - const std::function& shape_getter, bool release_inputs, - se::DeviceMemoryAllocator* allocator); - -Status RebuildOutputAliases( - const RefPtr& output_tuple, - absl::Span> input_tuples, - const xla::HloInputOutputAliasConfig& input_output_alias); - -xla::StatusOr> GetArgumentsBuffers( - const xla::HloInputOutputAliasConfig& input_output_alias, - absl::Span> input_tuples, - const std::vector& input_is_dynamic, bool release_inputs); - -// Create the XRT execute output tensor given the computation result -// (output_tuple). The return_exploded_tuple tells whether a tuple result should -// be returned as vector of handles representing each tuple child. -Status CreateExecuteOutput(OpKernelContext* context, - XRTMemoryManager* memory_manager, - RefPtr output_tuple, - bool return_exploded_tuple); - -// Drives the XRT chained computation execution given the supplied core execute -// function. -using ChainedExecuteFn = - std::function>( - const xrt::XRTChainedExecuteOp&, - absl::Span>)>; -Status ExecuteChained(OpKernelContext* context, - const RefPtr& memory_manager, - xla::Backend* backend, int device_ordinal, - const xrt::XRTChainedExecutePlan& plan, - const xrt::XRTChainedExecuteConfig& config, - const ChainedExecuteFn& execute_op, - se::DeviceMemoryAllocator* allocator); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8491b7fbf2f2a2..b90c7cbbb5cc44 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -121,7 +121,6 @@ package( "//tensorflow_models:__subpackages__", ], features = if_google([ - "-layering_check", "-parse_headers", ]), licenses = ["notice"], @@ -1310,6 +1309,7 @@ cc_library( ], hdrs = [":lib_internal_public_headers"], copts = tf_copts(), + features = ["-layering_check"], deps = tf_additional_lib_deps() + [ ":core_stringpiece", ":lib_proto_parsing", @@ -1456,6 +1456,7 @@ cc_library( }) + # The TF proto implementations that we will statically link here. 
[ + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc_impl", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc_impl", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc_impl", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc_impl", @@ -1641,6 +1642,7 @@ tf_cuda_library( ], hdrs = [":framework_internal_public_headers"], copts = tf_copts(), + features = ["-layering_check"], linkopts = select({ "//tensorflow:freebsd": ["-lm"], "//tensorflow:windows": [], @@ -1770,7 +1772,11 @@ tf_cuda_library( ":protos_all_cc", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "@eigen_archive//:eigen3", ], ) diff --git a/tensorflow/core/api_def/base_api/api_def_GlobalIterId.pbtxt b/tensorflow/core/api_def/base_api/api_def_GlobalIterId.pbtxt new file mode 100644 index 00000000000000..7ec4d4db81f96c --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_GlobalIterId.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "GlobalIterId" + visibility: HIDDEN +} \ No newline at end of file diff --git a/tensorflow/core/api_def/base_api/api_def_ListSnapshotChunksDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ListSnapshotChunksDataset.pbtxt new file mode 100644 index 00000000000000..83bce65aa59919 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ListSnapshotChunksDataset.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "ListSnapshotChunksDataset" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/python_api/api_def_GlobalIterId.pbtxt b/tensorflow/core/api_def/python_api/api_def_GlobalIterId.pbtxt new file mode 100644 index 00000000000000..7ec4d4db81f96c --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_GlobalIterId.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "GlobalIterId" + visibility: HIDDEN +} \ No newline at end of file diff --git a/tensorflow/core/build_defs.bzl b/tensorflow/core/build_defs.bzl index b9952c2214524f..9d948028278ffd 100644 --- a/tensorflow/core/build_defs.bzl +++ b/tensorflow/core/build_defs.bzl @@ -4,12 +4,18 @@ load("//third_party/bazel_rules/rules_python/python:py_binary.bzl", "py_binary") def _tf_core_transition_impl(settings, attr): _ignore = (settings, attr) # @unused - return {"@local_tsl//tsl/framework/contraction:disable_onednn_contraction_kernel": True} + return { + "@local_tsl//tsl/framework/contraction:disable_onednn_contraction_kernel": True, + "//tensorflow/compiler/mlir/python:disable_mlir": True, + } _tf_core_transition = transition( implementation = _tf_core_transition_impl, inputs = [], - outputs = ["@local_tsl//tsl/framework/contraction:disable_onednn_contraction_kernel"], + outputs = [ + "@local_tsl//tsl/framework/contraction:disable_onednn_contraction_kernel", + "//tensorflow/compiler/mlir/python:disable_mlir", + ], ) def _py_binary_tf_core_impl(ctx): diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index bf5b15eebdc72f..cfec2624420c3f 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -131,14 +131,12 @@ cc_library( srcs = ["collective_test_util.cc"], hdrs = ["collective_test_util.h"], copts = tf_copts(), - features = ["-layering_check"], deps = [ ":device_resolver_local", 
":process_util", "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:session_options", "//tensorflow/core:testlib", "//tensorflow/core/framework:allocator", "//tensorflow/core/framework:device_attributes_proto_cc", @@ -146,6 +144,8 @@ cc_library( "//tensorflow/core/nccl:collective_communicator", "//tensorflow/core/platform:refcount", "//tensorflow/core/platform:status", + "//tensorflow/core/platform:unbounded_work_queue", + "@com_google_absl//absl/synchronization", ], ) @@ -329,7 +329,6 @@ cc_library( srcs = ["all_to_all.cc"], hdrs = ["all_to_all.h"], copts = tf_copts(), - features = ["-layering_check"], deps = [ ":base_collective_executor", ":collective_rma_local", @@ -341,7 +340,7 @@ cc_library( ":process_util", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/platform:blocking_counter", ], alwayslink = 1, ) @@ -385,13 +384,13 @@ cc_library( srcs = ["buf_rendezvous.cc"], hdrs = ["buf_rendezvous.h"], copts = tf_copts(), - features = ["-layering_check"], deps = [ ":device", ":device_mgr", ":process_util", "//tensorflow/core:framework", "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], ) @@ -462,12 +461,13 @@ cc_library( srcs = ["collective_param_resolver_local.cc"], hdrs = ["collective_param_resolver_local.h"], copts = tf_copts(), - features = ["-layering_check"], deps = [ ":device_mgr", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", ], ) @@ -1373,6 +1373,7 @@ cc_library( ":bfc_allocator", ":pool_allocator", "//tensorflow/core:lib", + "//tensorflow/core/util:env_var", "//tensorflow/core/util:onednn_env_vars", ], ) @@ -1388,6 +1389,7 @@ cc_library( deps = [ ":function", ":optimization_registry", + ":process_util", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:graph", diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index edeca472fa9b4a..376b6d81351458 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -1,5 +1,3 @@ -load("//tensorflow:tensorflow.default.bzl", "filegroup") -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_zendnn", @@ -9,6 +7,8 @@ load( "tf_cuda_library", "tf_mkl_kernel_library", ) +load("//tensorflow:tensorflow.default.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//third_party/mkl:build_defs.bzl", "if_mkl", @@ -119,6 +119,8 @@ tf_cuda_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ] + select({ "//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -301,6 +303,8 @@ tf_cuda_library( "//tensorflow/core/platform:platform_port", "//tensorflow/core/util:managed_stack_trace", "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc 
b/tensorflow/core/common_runtime/eager/attr_builder_test.cc index 185acbf9463428..1baf0ddcdceb48 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc +++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc @@ -162,10 +162,12 @@ TEST(AttrBuilder, BuildNodeDef_Modified) { AttrBuilder a("MatMul"); a.Set("transpose_a", true); a.Set("transpose_b", false); + a.Set("grad_x", true); + a.Set("grad_y", false); a.NumInputs(2); const NodeDef& node_def = a.BuildNodeDef(); - EXPECT_EQ(node_def.attr().size(), 2); + EXPECT_EQ(node_def.attr().size(), 6); a.Set("new_attr", 15); a.NumInputs(3); @@ -173,11 +175,15 @@ TEST(AttrBuilder, BuildNodeDef_Modified) { const NodeDef& node_def2 = a.BuildNodeDef(); auto attrs = node_def2.attr(); - EXPECT_EQ(attrs.size(), 3); + EXPECT_EQ(attrs.size(), 7); ASSERT_NE(attrs.find("transpose_a"), attrs.end()); EXPECT_EQ(attrs.find("transpose_a")->second.b(), true); ASSERT_NE(attrs.find("transpose_b"), attrs.end()); EXPECT_EQ(attrs.find("transpose_b")->second.b(), false); + ASSERT_NE(attrs.find("grad_x"), attrs.end()); + EXPECT_EQ(attrs.find("grad_x")->second.b(), true); + ASSERT_NE(attrs.find("grad_y"), attrs.end()); + EXPECT_EQ(attrs.find("grad_y")->second.b(), false); ASSERT_NE(attrs.find("new_attr"), attrs.end()); EXPECT_EQ(attrs.find("new_attr")->second.i(), 15); } diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 1ba7291a3e07be..a7306be3b8b431 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -26,6 +26,8 @@ limitations under the License. // clang-format off // Required for IS_MOBILE_PLATFORM +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" @@ -1008,6 +1010,42 @@ Status EagerContext::AddFunctionDef(const FunctionDef& fdef, return OkStatus(); } +Status EagerContext::AddComponentFunction(const FunctionDef& fdef, + const FunctionDefLibrary& library) { + { + mutex_lock l(cache_mu_); + auto iter = component_function_libraries_.find(fdef.signature().name()); + if (iter == component_function_libraries_.end()) { + // TODO(mrry): For any functions in the main function library, consider + // deduplicating them here. + auto component_func_lib_def = std::make_unique( + OpRegistry::Global(), library); + TF_RETURN_IF_ERROR(component_func_lib_def->AddFunctionDef(fdef, {})); + component_function_libraries_.insert( + {fdef.signature().name(), std::move(component_func_lib_def)}); + } else { + // The function has been registered before. If the function is different, + // we error out. + const FunctionDef* prev_fdef = + iter->second->Find(fdef.signature().name()); + if (prev_fdef == nullptr) { + return absl::InternalError( + absl::StrCat("Component function: ", fdef.signature().name(), + " is in the cache but not in the library")); + } + if (!FunctionDefsEqual(fdef, *prev_fdef)) { + return absl::InvalidArgumentError(absl::StrCat( + "Attempting to add a duplicate function with name: ", + fdef.signature().name(), " where the previous and current ", + "definitions differ. 
Previous definition: ", + prev_fdef->DebugString(), + " and current definition: ", fdef.DebugString())); + } + } + } + return OkStatus(); +} + const FunctionDef* EagerContext::GetFunctionDef(const string& function_name) { return func_lib_def_.Find(function_name); } diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 3aa9a5a3d03890..075849fae3304b 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -251,6 +251,14 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { bool add_to_local_only = false, const StackTracesMap& stack_traces = {}); + // Adds a component function (i.e. containing a subgraph of a multi-process + // function) implemented as `fdef`. + // + // REQUIRES: `library` must contain all functions reachable from `fdef`. It + // should not contain `fdef` itself. + Status AddComponentFunction(const FunctionDef& fdef, + const FunctionDefLibrary& library); + const FunctionDef* GetFunctionDef(const string& function_name); std::vector ListFunctionNames() override; @@ -385,6 +393,16 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { FunctionLibraryDefinition* FuncLibDef() override { return &func_lib_def_; } + FunctionLibraryDefinition* GetComponentFunctionFunctionLibraryDefinition( + const string& function_name) { + tf_shared_lock lock(cache_mu_); + auto iter = component_function_libraries_.find(function_name); + if (iter != component_function_libraries_.end()) { + return iter->second.get(); + } + return nullptr; + } + #if !defined(IS_MOBILE_PLATFORM) // Assign the EagerClient pointer to `client` based on the given device / task // name, and increment the refcount of the client. The reference ownership is @@ -756,6 +774,9 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted { kernel_cache_ TF_GUARDED_BY(cache_mu_); std::unordered_map registered_functions_ TF_GUARDED_BY(cache_mu_); + + std::unordered_map> + component_function_libraries_ TF_GUARDED_BY(cache_mu_); absl::flat_hash_map device_cache_ TF_GUARDED_BY(device_cache_mu_); std::unordered_map>> diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc index 545585750b6abb..58888afece8bd1 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.cc +++ b/tensorflow/core/common_runtime/eager/eager_operation.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "absl/types/span.h" #include "tensorflow/c/eager/abstract_operation.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" @@ -27,6 +29,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/custom_device.h" #include "tensorflow/core/common_runtime/input_colocation_exemption_registry.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/host_info.h" @@ -333,16 +336,24 @@ Status EagerOperation::Reset( if (!is_function) { const auto& exempt_ops = InputColocationExemptionRegistry::Global()->Get(); colocation_exempt_ = exempt_ops.find(op) != exempt_ops.end(); - TF_RETURN_IF_ERROR(OpDefForOp(op, &op_def_)); - } else if (!remote && !ctx_.FindFunctionByName(op)) { - return errors::NotFound( - "'", op, - "' is neither a type of a primitive operation nor a name " - "of a function registered in binary running on ", - port::Hostname(), - ". Make sure the operation or function is " - "registered in the binary running in this process."); + } else if (!remote) { + const FunctionLibraryDefinition* func_lib_def; + if (eager_func_params.has_value() && + eager_func_params.value().func_lib_def_override != nullptr) { + func_lib_def = eager_func_params.value().func_lib_def_override; + } else { + func_lib_def = ctx_.FuncLibDef(); + } + if (func_lib_def->Find(op) == nullptr) { + return absl::NotFoundError(absl::StrCat( + "'", op, + "' is neither a type of a primitive operation nor a name " + "of a function registered in binary running on ", + port::Hostname(), + ". Make sure the operation or function is " + "registered in the binary running in this process.")); + } } attrs_.Reset(op); stack_trace_.reset(); diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index ccde391e8dc53d..3ddf91c5ed5f52 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -33,6 +33,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/managed_stack_trace.h" @@ -153,6 +154,23 @@ class EagerOperation : public ImmediateExecutionOperation { tensorflow::EagerContext& EagerContext() const { return ctx_; } + const FunctionLibraryDefinition* FuncLibDef() const { + if (eager_func_params_.has_value() && + eager_func_params_.value().func_lib_def_override) { + return eager_func_params_.value().func_lib_def_override; + } else { + return ctx_.FuncLibDef(); + } + } + + const FunctionDef* GetFunctionDef() const { + if (is_function_) { + return FuncLibDef()->Find(attrs_.op_name()); + } else { + return nullptr; + } + } + AttrBuilder* MutableAttrs() { return &attrs_; } const AttrBuilder& Attrs() const { return attrs_; } diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index daaab604d2a01d..0d68aac0cff554 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -291,8 +291,7 @@ Status GetOutputDTypes(EagerOperation* op, DataTypeVector* output_dtypes) { const auto& node_def = op->MutableAttrs()->BuildNodeDef(); const OpDef* op_def = nullptr; - const FunctionDef* function_def = - op->EagerContext().FuncLibDef()->Find(op->Name()); + const FunctionDef* function_def = op->GetFunctionDef(); if (function_def != nullptr) { op_def = &(function_def->signature()); } else { @@ -420,8 +419,7 @@ Status GetFuncAttr(const EagerOperation* op, const EagerContext& ctx, return OkStatus(); } - const FunctionDef* function_def = - ctx.pflr()->GetFunctionLibraryDefinition()->Find(op->Name()); + const FunctionDef* function_def = op->GetFunctionDef(); if (function_def == nullptr) { return errors::NotFound("Failed to find function '", op->Name(), "'"); } @@ -445,8 +443,7 @@ Status HasTPUReplication(const EagerOperation& op, const EagerContext& ctx, return OkStatus(); } - const FunctionDef* function_def = - ctx.pflr()->GetFunctionLibraryDefinition()->Find(op.Name()); + const FunctionDef* function_def = op.GetFunctionDef(); if (function_def == nullptr) { return errors::NotFound("Failed to find function '", op.Name(), "'"); } @@ -513,11 +510,12 @@ Status HasNestedJitCompile(const EagerOperation& op, const EagerContext& ctx, std::queue function_names; function_names.push(op.Name()); + const FunctionLibraryDefinition* func_lib_def = op.FuncLibDef(); + while (!function_names.empty()) { const string& function_name = function_names.front(); - const FunctionDef* function_def = - ctx.pflr()->GetFunctionLibraryDefinition()->Find(function_name); + const FunctionDef* function_def = func_lib_def->Find(function_name); if (function_def == nullptr) { return errors::NotFound("Failed to find function '", function_name, "'"); } @@ -1537,8 +1535,8 @@ Status GetOrCreateKernelAndDevice( ctx.GetCollectiveExecutorHandle(), ctx.HostCPU())); } - TF_RETURN_IF_ERROR( - kernel->Init(ctx.LogDevicePlacement(), ndef, graph_collector)); + TF_RETURN_IF_ERROR(kernel->Init(ctx.LogDevicePlacement(), ndef, + graph_collector, op->eager_func_params())); // Exclude tf.data op kernels from being cached. 
The reason for this is // that tf.data op kernels that accept a user-defined function will have a @@ -1548,8 +1546,7 @@ Status GetOrCreateKernelAndDevice( // programs that build input pipeline graphs in a loop. const OpDef* op_def; if (op->is_function()) { - const FunctionDef* function_def = - op->EagerContext().FuncLibDef()->Find(op->Name()); + const FunctionDef* function_def = op->GetFunctionDef(); if (function_def != nullptr) { op_def = &(function_def->signature()); } else { @@ -1976,8 +1973,8 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, std::unique_ptr node(new eager::RemoteExecuteNode( &op->EagerContext(), std::move(request), op_device, ctx.GetContextViewId(), eager_client.get(), op->GetCancellationManager(), - op->MutableAttrs()->BuildNodeDef(), op->EagerContext().FuncLibDef(), - *inputs, {retvals, num_outputs})); + op->MutableAttrs()->BuildNodeDef(), op->FuncLibDef(), *inputs, + {retvals, num_outputs})); if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) { string msg = strings::StrCat( diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 7b3b383b3ddb44..460fab04252ece 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/status/status.h" #include "absl/strings/match.h" +#include "absl/types/optional.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" @@ -103,9 +104,14 @@ KernelAndDeviceFunc::~KernelAndDeviceFunc() { } } -Status KernelAndDeviceOp::Init(const bool log_device_placement, - const NodeDef& ndef, - GraphCollector* graph_collector) { +Status KernelAndDeviceOp::Init( + const bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional<EagerFunctionParams>& eager_func_params) { + if (eager_func_params.has_value()) { + return absl::InternalError( + "KernelAndDeviceOp does not support EagerFunctionParams."); + } OpKernel* k = nullptr; if (flr_ == nullptr) { return errors::Internal( @@ -141,22 +147,31 @@ Status KernelAndDeviceOp::Init(const bool log_device_placement, return OkStatus(); } -Status KernelAndDeviceFunc::InstantiateFunc(const bool log_device_placement, - const NodeDef& ndef, - GraphCollector* graph_collector) { +Status KernelAndDeviceFunc::InstantiateFunc( + const bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional<EagerFunctionParams>& eager_func_params) { const OpDef* op_def = nullptr; - const FunctionDef* function_def; - if (flr_ == nullptr) { - // If function is being executed without an explicit device request, - // lookup the FunctionDef in the CPU's FLR. All FLRs share the same - // library.
- function_def = pflr_->GetFLR(host_cpu_device_->name()) - ->GetFunctionLibraryDefinition() - ->Find(ndef.op()); + const FunctionLibraryDefinition* func_lib_def; + FunctionLibraryRuntime::InstantiateOptions options; + + if (eager_func_params.has_value() && + eager_func_params.value().func_lib_def_override != nullptr) { + func_lib_def = eager_func_params.value().func_lib_def_override; + options.lib_def = func_lib_def; } else { - function_def = flr_->GetFunctionLibraryDefinition()->Find(ndef.op()); + if (flr_ == nullptr) { + // If function is being executed without an explicit device request, + // lookup the FunctionDef in the CPU's FLR. All FLRs share the same + // library. + func_lib_def = pflr_->GetFLR(host_cpu_device_->name()) + ->GetFunctionLibraryDefinition(); + } else { + func_lib_def = flr_->GetFunctionLibraryDefinition(); + } } + const FunctionDef* function_def = func_lib_def->Find(ndef.op()); if (function_def != nullptr) { op_def = &(function_def->signature()); } else { @@ -165,7 +180,6 @@ Status KernelAndDeviceFunc::InstantiateFunc(const bool log_device_placement, TF_RETURN_IF_ERROR( InOutTypesForNode(ndef, *op_def, &input_dtypes_, &output_dtypes_)); - FunctionLibraryRuntime::InstantiateOptions options; options.target = device_ == nullptr ? "" : device_->name(); options.is_multi_device_function = true; for (const Device* device : input_devices_) { @@ -174,13 +188,10 @@ Status KernelAndDeviceFunc::InstantiateFunc(const bool log_device_placement, options.composite_devices = composite_devices_; options.input_resource_dtypes_and_shapes = input_resource_dtypes_and_shapes_; if (outputs_on_op_device_) { - const FunctionLibraryDefinition* lib_def = - pflr_->GetFunctionLibraryDefinition(); - const FunctionDef* fdef = lib_def->Find(ndef.op()); - if (fdef == nullptr) { + if (function_def == nullptr) { return errors::InvalidArgument("Failed to find function ", ndef.op()); } - for (int i = 0; i < fdef->signature().output_arg_size(); ++i) { + for (int i = 0; i < function_def->signature().output_arg_size(); ++i) { options.output_devices.push_back(options.target); } } @@ -248,11 +259,12 @@ Status KernelAndDeviceFunc::InstantiateFunc(const bool log_device_placement, return pflr_->IsCrossProcess(handle_, &is_cross_process_); } -Status KernelAndDeviceFunc::Init(const bool log_device_placement, - const NodeDef& ndef, - GraphCollector* graph_collector) { - TF_RETURN_IF_ERROR( - InstantiateFunc(log_device_placement, ndef, graph_collector)); +Status KernelAndDeviceFunc::Init( + const bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) { + TF_RETURN_IF_ERROR(InstantiateFunc(log_device_placement, ndef, + graph_collector, eager_func_params)); return pflr_->GetOutputDevices(handle_, &output_devices_); } diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h index a98427a9e04d27..7a800f9b2a15d1 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.h +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h @@ -60,14 +60,19 @@ class FunctionLibraryRuntime; const int64_t kInvalidOpId = -1; -// This struc is used for: -// 1. setting op_id and step_id, is_component_function for single-client +// This struct is used for: +// 1. Setting `op_id` and `step_id`, `is_component_function` for single-client // remote function scenario, -// 2. setting step_id for multi-client parallel_device scenario. +// 2. 
Setting `step_id` for multi-client parallel_device scenario. +// 3. Supplying an overriding, private `FunctionLibraryDefinition` for component +// functions. struct EagerFunctionParams { int64_t op_id = kInvalidOpId; bool is_component_function; std::optional step_id = std::nullopt; + FunctionLibraryDefinition* func_lib_def_override = + nullptr; // Not owned (owned by `EagerContext`). If not null, functions + // called by the function will be looked up in this library. }; class EagerKernelArgs : public FunctionArgsInterface { @@ -113,8 +118,10 @@ class KernelAndDevice : public core::RefCounted { // // The provided FunctionLibraryRuntime MUST outlive all calls to // Run() on the returned KernelAndDevice. - virtual Status Init(bool log_device_placement, const NodeDef& ndef, - GraphCollector* graph_collector) = 0; + virtual Status Init( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) = 0; // Non-multi-device functions are run using regular CallOp and look like // primitive operations from KernelAndDevice perspective. @@ -215,8 +222,10 @@ class KernelAndDeviceOp final : public KernelAndDevice { ~KernelAndDeviceOp() override = default; - Status Init(bool log_device_placement, const NodeDef& ndef, - GraphCollector* graph_collector) override; + Status Init( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) override; Status Run( ScopedStepContainer* step_container, const EagerKernelArgs& inputs, @@ -316,11 +325,15 @@ class KernelAndDeviceFunc : public KernelAndDevice { bool IsCrossProcess() override { return is_cross_process_; } - Status InstantiateFunc(bool log_device_placement, const NodeDef& ndef, - GraphCollector* graph_collector); + Status InstantiateFunc( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params); - Status Init(bool log_device_placement, const NodeDef& ndef, - GraphCollector* graph_collector) override; + Status Init( + bool log_device_placement, const NodeDef& ndef, + GraphCollector* graph_collector, + const absl::optional& eager_func_params) override; Status Run( ScopedStepContainer* step_container, const EagerKernelArgs& inputs, diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc index 33122bc4c38105..bda3e5f582fc05 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc @@ -118,7 +118,7 @@ void BM_KernelAndDeviceInit(::testing::benchmark::State& state) { KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr, nullptr, env.cpu_device()); for (auto s : state) { - TF_CHECK_OK(k.Init({}, ndef, nullptr)); + TF_CHECK_OK(k.Init({}, ndef, nullptr, std::nullopt)); } } BENCHMARK(BM_KernelAndDeviceInit); @@ -138,7 +138,7 @@ void BM_KernelAndDeviceRun(::testing::benchmark::State& state) { TestEnv env; KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr, nullptr, env.cpu_device()); - TF_CHECK_OK(k.Init({}, ndef, nullptr)); + TF_CHECK_OK(k.Init({}, ndef, nullptr, std::nullopt)); const EagerKernelArgs args(std::move(inputs)); for (auto s : state) { TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, std::nullopt, diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 
93371be2fc12ab..1f7613b9ec48c1 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -1586,7 +1586,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // impact. TF_CHECK_OK(GetNodeAttr(n->def(), "transpose_a", &trans_a)); - return !trans_a; + // Only rewrite float and bfloat16. + DataType T_m; + TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T_m)); + + return !trans_a && (T_m == DT_FLOAT || T_m == DT_BFLOAT16); } // Check if we are performing pooling on depth or batch. If it is, then we @@ -1864,6 +1868,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { fused_ops == std::vector{"BiasAdd", "Relu"} || fused_ops == std::vector{"BiasAdd", "Relu6"} || fused_ops == std::vector{"BiasAdd", "Elu"} || + fused_ops == std::vector{"BiasAdd", "_FusedHardSwish"} || fused_ops == std::vector{"BiasAdd", "Add"} || fused_ops == std::vector{"BiasAdd", "Add", "Relu"} || fused_ops == std::vector{"BiasAdd", "Add", "Relu6"} || @@ -1899,7 +1904,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return (fused_ops == std::vector{"BiasAdd"} || fused_ops == std::vector{"BiasAdd", "Relu"} || fused_ops == std::vector{"BiasAdd", "Relu6"} || - fused_ops == std::vector{"BiasAdd", "Elu"}); + fused_ops == std::vector{"BiasAdd", "Elu"} || + fused_ops == std::vector{"BiasAdd", "_FusedHardSwish"}); } // Rewrites input node to a new node specified by its matching rewrite info. diff --git a/tensorflow/core/common_runtime/next_pluggable_device/BUILD b/tensorflow/core/common_runtime/next_pluggable_device/BUILD index a5ace31ae81401..9c9ce942b78e78 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/BUILD +++ b/tensorflow/core/common_runtime/next_pluggable_device/BUILD @@ -211,6 +211,7 @@ cc_library( name = "c_plugin_op_kernel", srcs = ["c_plugin_op_kernel.cc"], hdrs = ["c_plugin_op_kernel.h"], + copts = ["-DTF_CAPI_WEAK"], visibility = ["//visibility:public"], deps = [ ":c_plugin_variable", @@ -300,7 +301,7 @@ cc_library( name = "c_plugin_coordination_service_agent", srcs = ["c_plugin_coordination_service_agent.cc"], hdrs = ["c_plugin_coordination_service_agent.h"], - defines = ["TF_CAPI_WEAK"], + copts = ["-DTF_CAPI_WEAK"], visibility = ["//visibility:public"], deps = [ ":plugin_coordination_service_agent", @@ -352,7 +353,7 @@ cc_library( name = "c_plugin_variable", srcs = ["c_plugin_variable.cc"], hdrs = ["c_plugin_variable.h"], - defines = ["TF_CAPI_WEAK"], + copts = ["-DTF_CAPI_WEAK"], visibility = ["//visibility:public"], deps = [ ":plugin_variable", diff --git a/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.cc b/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.cc index a266fe7bcf8f3a..109d9ed62b95b2 100644 --- a/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.cc +++ b/tensorflow/core/common_runtime/next_pluggable_device/c_plugin_op_kernel.cc @@ -83,7 +83,7 @@ Status CPluginOpKernelConstruction::GetInt32AttrList( &total_size, status); TF_RETURN_IF_ERROR(StatusFromTF_Status(status)); - value->reserve(list_size); + value->resize(list_size); TF_OpKernelConstruction_GetAttrInt32List( ctx_, attr_name.data(), value->data(), /*max_vals=*/list_size, status); diff --git a/tensorflow/core/common_runtime/replicate_constants_pass.cc b/tensorflow/core/common_runtime/replicate_constants_pass.cc index d81db57beb23b3..376129bad99719 100644 --- a/tensorflow/core/common_runtime/replicate_constants_pass.cc +++ 
b/tensorflow/core/common_runtime/replicate_constants_pass.cc @@ -67,13 +67,39 @@ bool HasCpuDevice(const Node* node) { return device.type == "CPU"; } +// Converts a device name to the corresponding CPU device name. If aggressive +// constant replication across local CPU devices is enabled, the resulting CPU +// device name will also contain the device id. +Status DeviceNameToCpuDeviceNameWithDeviceId(const string& device_name, + string* host_device_name) { + DeviceNameUtils::ParsedName device; + if (!DeviceNameUtils::ParseFullName(device_name, &device)) { + return absl::InternalError( + absl::StrCat("Could not parse device name ", device_name)); + } + // If aggressive constant replication is enabled and the dst node is on CPU, + // we just use the device name of the dst for the src. + if (flags::Global().enable_aggressive_constant_replication.value() && + device.type == "CPU") { + *host_device_name = device_name; + } else { + // Otherwise, assign the corresponding CPU 0 device to it. + device.type = "CPU"; + device.has_type = true; + device.id = 0; + device.has_id = true; + *host_device_name = DeviceNameUtils::ParsedNameToString(device); + } + return OkStatus(); +} + // Get the CPU device on the same host as dst. Status GetDestinationCpuDevice(const Node* dst, std::string* device) { if (!dst->has_assigned_device_name()) return absl::AbortedError( absl::StrCat("Node name: ", dst->name(), " has no assigned device.")); - return DeviceNameUtils::DeviceNameToCpuDeviceName(dst->assigned_device_name(), - device); + return DeviceNameToCpuDeviceNameWithDeviceId(dst->assigned_device_name(), + device); } // Collect the successor edges of the constant. Group them by the device of the diff --git a/tensorflow/core/config/BUILD b/tensorflow/core/config/BUILD index 7a2400c2f64206..53f42c5759ecfb 100644 --- a/tensorflow/core/config/BUILD +++ b/tensorflow/core/config/BUILD @@ -95,6 +95,7 @@ py_strict_test( python_version = "PY3", deps = [ ":flags_py", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) diff --git a/tensorflow/core/config/flag_defs.h b/tensorflow/core/config/flag_defs.h index 6a99c548ac9cfc..2061add2f1d4b8 100644 --- a/tensorflow/core/config/flag_defs.h +++ b/tensorflow/core/config/flag_defs.h @@ -53,6 +53,9 @@ class Flags { "Enables the publication of partitioned function graphs " "via StatsPublisherInterface.
Disabling this flag can " "reduce memory consumption."); + TF_DECLARE_FLAG(enable_aggressive_constant_replication, true, + "Replicate constants across CPU devices and even for local " + "CPUs within the same task if available.") // LINT.ThenChange(//tensorflow/core/config/flags_api_wrapper.cc) }; diff --git a/tensorflow/core/config/flags_api_wrapper.cc b/tensorflow/core/config/flags_api_wrapper.cc index 3d0a001aecf903..769a9a4db2d983 100644 --- a/tensorflow/core/config/flags_api_wrapper.cc +++ b/tensorflow/core/config/flags_api_wrapper.cc @@ -52,5 +52,6 @@ PYBIND11_MODULE(flags_pybind, m) { TF_PY_DECLARE_FLAG(tf_shape_default_int64); TF_PY_DECLARE_FLAG(more_stack_traces); TF_PY_DECLARE_FLAG(publish_function_graphs); + TF_PY_DECLARE_FLAG(enable_aggressive_constant_replication); // LINT.ThenChange(//tensorflow/core/config/flag_defs.h) }; diff --git a/tensorflow/core/data/BUILD b/tensorflow/core/data/BUILD index a6debf6a378624..2f02dd9b8c88d6 100644 --- a/tensorflow/core/data/BUILD +++ b/tensorflow/core/data/BUILD @@ -1,4 +1,3 @@ -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_not_mobile", @@ -8,6 +7,7 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_protos_all", ) +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -39,6 +39,10 @@ exports_files([ "serialization_utils.h", "split_utils.cc", "split_utils.h", + "file_logger_client_no_op.h", + "file_logger_client_no_op.cc", + "file_logger_client_interface.h", + "file_logger_client_interface.cc", "stats_utils.cc", "stats_utils.h", "tfdataz_metrics.h", @@ -393,7 +397,9 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/core:status", + "//tensorflow/core/platform:stringpiece", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@local_tsl//tsl/platform:statusor", ], ) @@ -439,6 +445,7 @@ cc_library( "@local_tsl//tsl/platform:logging", "@local_tsl//tsl/platform:mutex", "@local_tsl//tsl/platform:thread_annotations", + "@local_tsl//tsl/platform:types", ], ) @@ -507,14 +514,15 @@ cc_library( deps = [ ":dataset_utils", ":root_dataset", + ":serialization_utils", ":unbounded_thread_pool", "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:session_options", - "//tensorflow/core/data:serialization_utils", "//tensorflow/core/framework:graph_proto_cc", "@com_google_absl//absl/memory", "@local_tsl//tsl/platform:env", @@ -630,9 +638,38 @@ cc_library( hdrs = ["utils.h"], # copybara:uncomment copts = ["-Wthread-safety-analysis"], deps = [ + ":file_logger_client_interface", + ":file_logger_client_no_op", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status:statusor", ], ) + +tf_cc_test( + name = "utils_test", + srcs = ["utils_test.cc"], + # copybara:uncomment extra_copts = ["-Wthread-safety-analysis"], + deps = [ + ":file_logger_client_interface", + ":file_logger_client_no_op", + ":utils", + "@com_google_googletest//:gtest_main", + ], +) + +cc_library( + name = "file_logger_client_interface", + hdrs = ["file_logger_client_interface.h"], + visibility = [ + "//learning/processing/tf_data_logger/client:__subpackages__", + 
"//tensorflow:internal", + ], +) + +cc_library( + name = "file_logger_client_no_op", + hdrs = ["file_logger_client_no_op.h"], + deps = [":file_logger_client_interface"], +) diff --git a/tensorflow/core/data/captured_function.h b/tensorflow/core/data/captured_function.h index c3d489dd855263..5d9a573aad0d3f 100644 --- a/tensorflow/core/data/captured_function.h +++ b/tensorflow/core/data/captured_function.h @@ -290,6 +290,8 @@ class InstantiatedCapturedFunction { FunctionLibraryRuntime::DoneCallback done, const std::shared_ptr& node) const; + std::string func_name() const { return captured_func_->func().name(); } + private: friend class CapturedFunction; diff --git a/tensorflow/core/data/dataset_utils.cc b/tensorflow/core/data/dataset_utils.cc index 0d662a7a938d33..7d0081c2e6de1e 100644 --- a/tensorflow/core/data/dataset_utils.cc +++ b/tensorflow/core/data/dataset_utils.cc @@ -1006,8 +1006,8 @@ REGISTER_DATASET_EXPERIMENT("no_compression", RandomJobSamplePercentage<50>, REGISTER_DATASET_EXPERIMENT("inject_io_prefetch", RandomJobSamplePercentage<0>, AllTasks); REGISTER_DATASET_EXPERIMENT("reduce_array_record_dataset_memory_usage", - RandomJobSamplePercentage<0>, AllTasks); -REGISTER_DATASET_EXPERIMENT("map_fusion", RandomJobSamplePercentage<10>, + RandomJobSamplePercentage<50>, AllTasks); +REGISTER_DATASET_EXPERIMENT("map_fusion", RandomJobSamplePercentage<0>, AllTasks); } // namespace } // namespace data diff --git a/tensorflow/core/data/file_logger_client_interface.h b/tensorflow/core/data/file_logger_client_interface.h new file mode 100644 index 00000000000000..afa6cda0cf15f5 --- /dev/null +++ b/tensorflow/core/data/file_logger_client_interface.h @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_FILE_LOGGER_CLIENT_INTERFACE_H_ +#define TENSORFLOW_CORE_DATA_FILE_LOGGER_CLIENT_INTERFACE_H_ + +#include +#include + +namespace tensorflow::data { + +// An abstract class to provides an easy and thread safe api to make +// asynchronous calls to the TFDataLoggerService. +// LogFilesAsync is guaranteed to be non blocking. +// The destructor however might be blocking. +class FileLoggerClientInterface { + public: + // Default constructor + FileLoggerClientInterface() = default; + + // Sends file names in `files` to the TFDataLoggerService. Asynchronously. + virtual void LogFilesAsync(std::vector files) = 0; + + // Default destructor. May block depending on implementation of the derived + // class. + virtual ~FileLoggerClientInterface() = default; +}; +} // namespace tensorflow::data + +#endif // TENSORFLOW_CORE_DATA_FILE_LOGGER_CLIENT_INTERFACE_H_ diff --git a/tensorflow/core/data/file_logger_client_no_op.h b/tensorflow/core/data/file_logger_client_no_op.h new file mode 100644 index 00000000000000..65247844f741c4 --- /dev/null +++ b/tensorflow/core/data/file_logger_client_no_op.h @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_FILE_LOGGER_CLIENT_NO_OP_H_ +#define TENSORFLOW_CORE_DATA_FILE_LOGGER_CLIENT_NO_OP_H_ + +#include +#include + +#include "tensorflow/core/data/file_logger_client_interface.h" + +namespace tensorflow::data { + +// Implementation of the abstract class FileLoggerClientInterface, which does +// nothing. It does not allocate any resources and immediately returns in +// LogFilesAsync. This is used in the 3rd party version of the tf.data library. +class FileLoggerClientNoOp : public FileLoggerClientInterface { + public: + // Default constructor + FileLoggerClientNoOp() = default; + + // Does not do anything + void LogFilesAsync(std::vector files) override{}; + + // Default destructor + ~FileLoggerClientNoOp() override = default; +}; +} // namespace tensorflow::data + +#endif // TENSORFLOW_CORE_DATA_FILE_LOGGER_CLIENT_NO_OP_H_ diff --git a/tensorflow/core/data/rewrite_utils.cc b/tensorflow/core/data/rewrite_utils.cc index 707e4d8264118d..76c05e6e47f2fc 100644 --- a/tensorflow/core/data/rewrite_utils.cc +++ b/tensorflow/core/data/rewrite_utils.cc @@ -249,7 +249,8 @@ Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input, } std::unique_ptr GetGrapplerItem( - GraphDef* graph_def, std::string* dataset_node, bool add_fake_sinks) { + GraphDef* graph_def, std::string* dataset_node, bool add_fake_sinks, + bool apply_optimizations) { // Add an identity node as the fetch node, otherwise we might get 'placeholder // is both fed and fetched' errors in some cases when using input list with // placeholder dataset nodes. @@ -285,7 +286,7 @@ std::unique_ptr GetGrapplerItem( // Create Grappler item. tensorflow::grappler::ItemConfig item_config; - item_config.apply_optimizations = true; + item_config.apply_optimizations = apply_optimizations; std::unique_ptr grappler_item = tensorflow::grappler::GrapplerItemFromMetaGraphDef( "graph", meta_graph_def, item_config); diff --git a/tensorflow/core/data/rewrite_utils.h b/tensorflow/core/data/rewrite_utils.h index 23ea965d67e105..44205dc83b24f5 100644 --- a/tensorflow/core/data/rewrite_utils.h +++ b/tensorflow/core/data/rewrite_utils.h @@ -57,10 +57,13 @@ Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input, // `dataset_node` is the name of the node corresponding to the dataset. // If `add_fake_sinks` is true, it adds fake sink nodes to the graph and functions to // allow rewriting the actual sink nodes. +// If `apply_optimizations` is true, general grappler optimizations at level +// `tensorflow::OptimizerOptions::L1` are applied to the graph. // TODO(b/118820916): When MetaOptimizer adds provisions for function retvals to // be optimizable, we will no longer need to add fake nodes.
std::unique_ptr GetGrapplerItem( - GraphDef* graph_def, std::string* dataset_node, bool add_fake_sinks); + GraphDef* graph_def, std::string* dataset_node, bool add_fake_sinks, + bool apply_optimizations = true); // Returns the name of the node corresponding to the dataset. It is indicated by // the symbolic `_Retval` node. diff --git a/tensorflow/core/data/root_dataset.cc b/tensorflow/core/data/root_dataset.cc index 55ff2bc8122213..bba8a426366329 100644 --- a/tensorflow/core/data/root_dataset.cc +++ b/tensorflow/core/data/root_dataset.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/stringprintf.h" +#include "tsl/platform/host_info.h" namespace tensorflow { namespace data { @@ -46,6 +47,8 @@ constexpr char kDatasetType[] = "Root"; constexpr char kAlgorithm[] = "algorithm"; constexpr char kCpuBudget[] = "cpu_budget"; constexpr char kExperiments[] = "experiments"; +constexpr char kReadRoundtripLatency[] = "read_latency_usec"; +constexpr char kReadResponseBytes[] = "read_bytes"; constexpr char kIntraOpParallelism[] = "intra_op_parallelism"; constexpr char kMemBandwidth[] = "mem_bw_used_megabytes_per_sec"; constexpr char kPrivateThreadpoolSize[] = "threadpool_size"; @@ -277,6 +280,27 @@ class RootDataset::Iterator : public DatasetIterator { "%lld", static_cast( model_node()->TotalMaximumBufferedBytes() / 1.0e6)))); } + const auto io_statistics = tsl::port::GetIOStatistics(); + if (io_statistics.roundtrip_latency_usec.count > 0) { + traceme_metadata.push_back(std::make_pair( + kReadRoundtripLatency, + strings::Printf( + "(count: %lld, mean: %lld, std dev: %lld)", + static_cast( + io_statistics.roundtrip_latency_usec.count), + static_cast(io_statistics.roundtrip_latency_usec.mean), + static_cast( + io_statistics.roundtrip_latency_usec.std_dev)))); + } + if (io_statistics.response_bytes.count > 0) { + traceme_metadata.push_back(std::make_pair( + kReadResponseBytes, + strings::Printf( + "(count: %lld, mean: %lld, std dev: %lld)", + static_cast(io_statistics.response_bytes.count), + static_cast(io_statistics.response_bytes.mean), + static_cast(io_statistics.response_bytes.std_dev)))); + } return traceme_metadata; } diff --git a/tensorflow/core/data/serialization_utils.cc b/tensorflow/core/data/serialization_utils.cc index e07ec49b9137de..01b5a1289e1257 100644 --- a/tensorflow/core/data/serialization_utils.cc +++ b/tensorflow/core/data/serialization_utils.cc @@ -14,12 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/data/serialization_utils.h" +#include #include #include #include #include #include +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_runner.h" #include "tensorflow/core/data/compression_utils.h" @@ -30,6 +32,7 @@ limitations under the License. 
#include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/framework/variant_tensor_data.h" #include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/platform/stringpiece.h" namespace tensorflow { namespace data { @@ -118,20 +121,39 @@ Status ReadElementsFromCheckpoint(IteratorContext* ctx, return OkStatus(); } +Status WriteElement(IteratorStateWriter* writer, StringPiece key_prefix, + const std::vector>& elements, + int64_t index) { + const std::vector& element = elements[index]; + std::string element_prefix = absl::StrCat(key_prefix, "::", index); + TF_RETURN_IF_ERROR( + writer->WriteScalar(element_prefix, kNumComponents, element.size())); + for (int j = 0; j < element.size(); ++j) { + TF_RETURN_IF_ERROR(writer->WriteTensor( + element_prefix, absl::StrCat(kComponent, "[", j, "]"), element[j])); + } + return OkStatus(); +} + Status WriteElementsToCheckpoint( IteratorStateWriter* writer, StringPiece key_prefix, const std::vector>& elements) { TF_RETURN_IF_ERROR( writer->WriteScalar(key_prefix, kNumElements, elements.size())); for (int i = 0; i < elements.size(); ++i) { - const std::vector& element = elements[i]; - std::string element_prefix = absl::StrCat(key_prefix, "::", i); - TF_RETURN_IF_ERROR( - writer->WriteScalar(element_prefix, kNumComponents, element.size())); - for (int j = 0; j < elements[i].size(); ++j) { - TF_RETURN_IF_ERROR(writer->WriteTensor( - element_prefix, absl::StrCat(kComponent, "[", j, "]"), element[j])); - } + TF_RETURN_IF_ERROR(WriteElement(writer, key_prefix, elements, i)); + } + return OkStatus(); +} + +Status UpdateCheckpointElements( + IteratorStateWriter* writer, StringPiece key_prefix, + const std::vector>& elements, + const absl::flat_hash_set& checkpoint_indices) { + TF_RETURN_IF_ERROR( + writer->WriteScalar(key_prefix, kNumElements, elements.size())); + for (int64_t i : checkpoint_indices) { + TF_RETURN_IF_ERROR(WriteElement(writer, key_prefix, elements, i)); } return OkStatus(); } diff --git a/tensorflow/core/data/serialization_utils.h b/tensorflow/core/data/serialization_utils.h index d5e83c32eb488f..b55dfdfb7eca8c 100644 --- a/tensorflow/core/data/serialization_utils.h +++ b/tensorflow/core/data/serialization_utils.h @@ -47,6 +47,15 @@ Status WriteElementsToCheckpoint( IteratorStateWriter* writer, StringPiece key_prefix, const std::vector>& elements); +// Updates the dataset elements in the checkpoint for given `checkpoint_indices` +// using the given key prefix, assuming that vector of elements have +// checkpointed these before. The elements can be read back by passing the same +// key prefix to ReadElementsFromCheckpoint. +Status UpdateCheckpointElements( + IteratorStateWriter* writer, StringPiece key_prefix, + const std::vector>& elements, + const absl::flat_hash_set& checkpoint_indices); + // Helper class for reading data from a vector of VariantTensorData objects. class VariantTensorDataReader : public IteratorStateReader { public: diff --git a/tensorflow/core/data/serialization_utils_test.cc b/tensorflow/core/data/serialization_utils_test.cc index ddd424c519841c..5de7acfdc30f53 100644 --- a/tensorflow/core/data/serialization_utils_test.cc +++ b/tensorflow/core/data/serialization_utils_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/core/data/serialization_utils.h" +#include #include #include #include @@ -203,6 +204,16 @@ class ParameterizedIteratorStateVariantTest } }; +class ParemeterizedCheckpointIndicesTest + : public DatasetOpsTestBase, + public ::testing::WithParamInterface> { + protected: + absl::flat_hash_set GetCheckpointIndices() const { + absl::flat_hash_set checkpoint_indices = GetParam(); + return checkpoint_indices; + } +}; + std::vector> TestCases() { return { CreateTensors(TensorShape{1}, {{1}}), // int64 @@ -216,6 +227,18 @@ std::vector> TestCases() { }; } +std::vector> CheckpointIndicesTestCases() { + return { + {/*checkpoint_indices*/}, + {/*checkpoint_indices*/ 0}, + {/*checkpoint_indices*/ 0, 1}, + {/*checkpoint_indices*/ 0, 1, 2}, + {/*checkpoint_indices*/ 1, 3, 4}, + {/*checkpoint_indices*/ 1, 2, 3, 4}, + {/*checkpoint_indices*/ 0, 1, 2, 3, 4}, + }; +} + TEST_P(ParameterizedIteratorStateVariantTest, EncodeAndDecode) { VariantTensorData data = GetVariantTensorData(); TF_ASSERT_OK_AND_ASSIGN(VariantTensorData result, EncodeAndDecode(data)); @@ -236,9 +259,58 @@ TEST_P(ParameterizedIteratorStateVariantTest, DecodeUncompressed) { } } +TEST_P(ParemeterizedCheckpointIndicesTest, + CheckpointElementsRoundTripUsingIndices) { + std::vector> elements; + elements.push_back(CreateTensors(TensorShape({3}), {{1, 2, 3}})); + elements.push_back(CreateTensors(TensorShape({2}), {{4, 5}})); + elements.push_back( + CreateTensors(TensorShape({5}), {{6, 7, 8, 9, 10}})); + elements.push_back( + CreateTensors(TensorShape({4}), {{11, 12, 13, 14}})); + elements.push_back(CreateTensors(TensorShape({2}), {{15, 16}})); + VariantTensorDataWriter writer; + tstring test_prefix = full_name("test_prefix"); + // Generate checkpoint for entire buffer + absl::flat_hash_set checkpoint_indices_write = {0, 1, 2, 3, 4}; + TF_ASSERT_OK(WriteElementsToCheckpoint(&writer, test_prefix, elements)); + // Update the elements at checkpoint indices + for (auto index : GetCheckpointIndices()) { + elements.at(index) = CreateTensors(TensorShape({1}), {{1}}); + } + TF_ASSERT_OK(UpdateCheckpointElements(&writer, test_prefix, elements, + GetCheckpointIndices())); + std::vector data; + writer.GetData(&data); + + VariantTensorDataReader reader(data); + std::vector> read_elements; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr ctx, + TestContext::Create()); + TF_ASSERT_OK(ReadElementsFromCheckpoint(ctx->iter_ctx(), &reader, test_prefix, + &read_elements)); + + ASSERT_EQ(elements.size(), read_elements.size()); + // Check if checkpoint state of entire buffer is as expected + for (int index = 0; index < elements.size(); ++index) { + std::vector& original = elements[index]; + std::vector& read = read_elements[index]; + + ASSERT_EQ(original.size(), read.size()); + for (int j = 0; j < original.size(); ++j) { + EXPECT_EQ(original[j].NumElements(), read[j].NumElements()); + EXPECT_EQ(original[j].flat()(0), read[j].flat()(0)); + } + } +} + INSTANTIATE_TEST_SUITE_P(Instantiation, ParameterizedIteratorStateVariantTest, ::testing::ValuesIn(TestCases())); +INSTANTIATE_TEST_SUITE_P(Instantiation, ParemeterizedCheckpointIndicesTest, + ::testing::ValuesIn(CheckpointIndicesTestCases())); + } // namespace } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/service/BUILD b/tensorflow/core/data/service/BUILD index ee93b3bc916f20..90acfd91600efb 100644 --- a/tensorflow/core/data/service/BUILD +++ b/tensorflow/core/data/service/BUILD @@ -1,16 +1,16 @@ -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") 
load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", "cc_grpc_library") +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) +load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "get_compatible_with_portable", "tf_grpc_cc_dependencies") load( "//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library", "tf_protos_profiler_service", ) -load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "get_compatible_with_portable", "tf_grpc_cc_dependencies") -load( - "//tensorflow:tensorflow.bzl", - "tf_cc_test", -) +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package_group( name = "data_transfer_visibility", @@ -739,6 +739,23 @@ cc_library( ], ) +tf_cc_test( + name = "split_provider_test", + srcs = ["split_provider_test.cc"], + # copybara:uncomment extra_copts = ["-Wthread-safety-analysis"], + deps = [ + ":common_proto_cc", + ":split_provider", + ":test_util", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", + "@local_tsl//tsl/lib/core:status_test_util", + "@local_tsl//tsl/platform:statusor", + ], +) + cc_library( name = "task_remover", srcs = ["task_remover.cc"], @@ -765,6 +782,7 @@ cc_library( ":thread_safe_buffer", ":worker_proto_cc", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/data:standalone", @@ -847,15 +865,14 @@ cc_library( "//tensorflow/core/framework:node_def_proto_cc", "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core/framework:types_proto_cc", - "//tensorflow/core/platform:errors", "//tensorflow/core/platform:path", - "//tensorflow/core/platform:protobuf", - "//tensorflow/core/platform:status", - "//tensorflow/core/platform:statusor", "//tensorflow/core/platform:tstring", "//tensorflow/core/platform:types", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:protobuf", ], ) diff --git a/tensorflow/core/data/service/graph_rewriters.cc b/tensorflow/core/data/service/graph_rewriters.cc index af154691549f2a..114ae7c336cedb 100644 --- a/tensorflow/core/data/service/graph_rewriters.cc +++ b/tensorflow/core/data/service/graph_rewriters.cc @@ -95,10 +95,18 @@ RemoveCompressionMapRewriter::ApplyRemoveCompressionMapRewrite( tensorflow::RewriterConfig::CustomGraphOptimizer config = GetRewriteConfig(); TF_RETURN_IF_ERROR(remove_compression_map.Init(&config)); + // Don't apply general grappler optimizations. Sometimes there is a conflict + // between two applications of these optimizations to the same graph (see + // b/303524867). This conflict isn't worth resolving in the context of this + // rewrite: the point of this rewrite is to remove one node and change one + // reference to it, not to apply any general optimizations. 
+ bool apply_general_grappler_optimizations = false; + GraphDef input_graph = graph_def; TF_ASSIGN_OR_RETURN(std::string dataset_node, GetDatasetNode(input_graph)); std::unique_ptr grappler_item = - GetGrapplerItem(&input_graph, &dataset_node, /*add_fake_sinks=*/false); + GetGrapplerItem(&input_graph, &dataset_node, /*add_fake_sinks=*/false, + apply_general_grappler_optimizations); GraphDef rewritten_graph; std::unordered_map device_map; diff --git a/tensorflow/core/data/service/snapshot/BUILD b/tensorflow/core/data/service/snapshot/BUILD index 46e420aac744bb..e3a16ec3b8b856 100644 --- a/tensorflow/core/data/service/snapshot/BUILD +++ b/tensorflow/core/data/service/snapshot/BUILD @@ -1,7 +1,7 @@ # Distributed snapshot library. load("//tensorflow:tensorflow.bzl", "tf_cc_test") -load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies") +load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies", "tf_kernel_library") load("//tensorflow/core/platform:build_config.bzl", "tf_protos_profiler_service") load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable") @@ -35,6 +35,7 @@ tf_cc_test( "@com_google_absl//absl/time", "@local_tsl//tsl/platform:env", "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:statusor", "@local_tsl//tsl/platform:tstring", ] + tf_grpc_cc_dependencies() + tf_protos_profiler_service(), ) @@ -85,6 +86,28 @@ tf_cc_test( ], ) +tf_kernel_library( + name = "list_snapshot_chunks_dataset_op", + srcs = ["list_snapshot_chunks_dataset_op.cc"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":snapshot_chunk_provider", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/data:name_utils", + "//tensorflow/core/framework:allocator", + "//tensorflow/core/framework:op_requires", + "//tensorflow/core/framework:types_proto_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:tstring", + ], +) + cc_library( name = "path_utils", srcs = ["path_utils.cc"], @@ -242,6 +265,58 @@ cc_library( ], ) +cc_library( + name = "snapshot_chunk_provider", + srcs = ["snapshot_chunk_provider.cc"], + hdrs = ["snapshot_chunk_provider.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":file_utils", + ":path_utils", + "//tensorflow/core:framework", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/synchronization", + "@local_tsl//tsl/distributed_runtime/rpc:grpc_util", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:path", + "@local_tsl//tsl/platform:status_to_from_proto", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:tstring", + "@local_tsl//tsl/protobuf:status_proto_cc", + ], +) + +tf_cc_test( + name = "snapshot_chunk_provider_test", + size = "small", + srcs = ["snapshot_chunk_provider_test.cc"], + deps = [ + ":file_utils", + ":path_utils", + ":snapshot_chunk_provider", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + 
"@com_google_absl//absl/synchronization", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:path", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:status_to_from_proto", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/protobuf:status_proto_cc", + ], +) + cc_library( name = "snapshot_stream_writer", srcs = ["snapshot_stream_writer.cc"], diff --git a/tensorflow/core/data/service/snapshot/distributed_snapshot_test.cc b/tensorflow/core/data/service/snapshot/distributed_snapshot_test.cc index ccd795706ea376..0a91582823f676 100644 --- a/tensorflow/core/data/service/snapshot/distributed_snapshot_test.cc +++ b/tensorflow/core/data/service/snapshot/distributed_snapshot_test.cc @@ -32,13 +32,14 @@ limitations under the License. #include "tsl/lib/io/compression.h" #include "tsl/platform/env.h" #include "tsl/platform/status_matchers.h" +#include "tsl/platform/statusor.h" #include "tsl/platform/test.h" +#include "tsl/platform/tstring.h" namespace tensorflow { namespace data { namespace { -using testing::ChooseFromDatasets; using testing::CreateDummyDistributedSnapshotMetadata; using ::testing::ElementsAre; using ::testing::IsEmpty; @@ -150,7 +151,8 @@ TEST_P(DistributedSnapshotTest, ChooseFromDatasets) { // choice_dataset = tf.data.Dataset.range(3).repeat() // dataset = tf.data.Dataset.choose_from_datasets(datasets, choice_dataset) TestSnapshotCluster data_service(NumWorkers()); - TF_ASSERT_OK_AND_ASSIGN(DatasetDef dataset, ChooseFromDatasets()); + TF_ASSERT_OK_AND_ASSIGN(DatasetDef dataset, + testing::GetTestDataset("choose_from_datasets")); experimental::DistributedSnapshotMetadata metadata = CreateDummyDistributedSnapshotMetadata(); std::string snapshot_path = LocalTempFilename(); @@ -158,8 +160,8 @@ TEST_P(DistributedSnapshotTest, ChooseFromDatasets) { data_service.dispatcher().Snapshot(dataset, snapshot_path, metadata)); TF_ASSERT_OK(WaitForSnapshotComplete(snapshot_path)); EXPECT_THAT( - testing::ReadSnapshot(snapshot_path, - tsl::io::compression::kNone), + testing::ReadSnapshot(snapshot_path, + tsl::io::compression::kNone), IsOkAndHolds(UnorderedElementsAre("a", "b", "c", "a", "b", "c", "a", "b", "c", "a", "b", "c", "a", "b", "c"))); } diff --git a/tensorflow/core/data/service/snapshot/list_snapshot_chunks_dataset_op.cc b/tensorflow/core/data/service/snapshot/list_snapshot_chunks_dataset_op.cc new file mode 100644 index 00000000000000..666374ffe7693c --- /dev/null +++ b/tensorflow/core/data/service/snapshot/list_snapshot_chunks_dataset_op.cc @@ -0,0 +1,198 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/data/name_utils.h" +#include "tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" +#include "tsl/platform/tstring.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr const char kListSnapshotChunksDataset[] = "ListSnapshotChunksDataset"; +constexpr const char kSnapshotPath[] = "snapshot_path"; + +Tensor ConvertToTensor(absl::string_view s, Allocator* allocator) { + Tensor tensor(allocator, DT_STRING, TensorShape({})); + tensor.scalar()() = tsl::tstring(s); + return tensor; +} + +// TODO(b/297930782): Implement split provider for this dataset. +class ListSnapshotChunksDatasetOp : public DatasetOpKernel { + public: + explicit ListSnapshotChunksDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +class ListSnapshotChunksDatasetOp::Dataset : public DatasetBase { + public: + Dataset(OpKernelContext* ctx, tsl::tstring snapshot_path, + const DataTypeVector& output_types, + const std::vector& output_shapes) + : DatasetBase(DatasetContext(ctx)), + snapshot_path_(std::move(snapshot_path)), + output_types_(output_types), + output_shapes_(output_shapes) {} + + absl::string_view snapshot_path() const { return snapshot_path_; } + + const DataTypeVector& output_dtypes() const override { return output_types_; } + + const std::vector& output_shapes() const override { + return output_shapes_; + } + + int64_t CardinalityInternal(CardinalityOptions options) const override { + // TODO(b/297930782): Implement this. 
+ return kUnknownCardinality; + } + + std::string DebugString() const override { + return name_utils::DatasetDebugString(kListSnapshotChunksDataset); + } + + absl::Status InputDatasets( + std::vector* inputs) const override { + inputs->clear(); + return absl::OkStatus(); + } + + absl::Status CheckExternalState() const override { return absl::OkStatus(); } + + protected: + std::unique_ptr MakeIteratorInternal( + const std::string& prefix) const override; + + absl::Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + Node* snapshot_path = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(snapshot_path_, &snapshot_path)); + return b->AddDataset(this, /*inputs=*/{snapshot_path}, output); + } + + private: + class Iterator; + + const tsl::tstring snapshot_path_; + const DataTypeVector output_types_; + const std::vector output_shapes_; +}; + +class ListSnapshotChunksDatasetOp::Dataset::Iterator + : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params) {} + + absl::Status Initialize(IteratorContext* ctx) override { + if (!snapshot_chunk_provider_) { + snapshot_chunk_provider_ = std::make_unique( + dataset()->snapshot_path(), ctx->env()); + } + return absl::OkStatus(); + } + + private: + absl::Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + TF_ASSIGN_OR_RETURN(std::optional chunk, + snapshot_chunk_provider_->GetNext()); + if (!chunk.has_value()) { + *end_of_sequence = true; + return absl::OkStatus(); + } + out_tensors->push_back(ConvertToTensor(*chunk, ctx->allocator({}))); + *end_of_sequence = false; + return absl::OkStatus(); + } + + absl::Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) override { + return snapshot_chunk_provider_->Save( + [&](const std::string& key) { return full_name(key); }, writer); + } + + absl::Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return snapshot_chunk_provider_->Restore( + [&](const std::string& key) { return full_name(key); }, reader); + } + + std::unique_ptr snapshot_chunk_provider_; +}; + +ListSnapshotChunksDatasetOp::ListSnapshotChunksDatasetOp( + OpKernelConstruction* ctx) + : DatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); +} + +void ListSnapshotChunksDatasetOp::MakeDataset(OpKernelContext* ctx, + DatasetBase** output) { + tsl::tstring snapshot_path; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kSnapshotPath, &snapshot_path)); + OP_REQUIRES(ctx, !snapshot_path.empty(), + absl::InvalidArgumentError( + "snapshot_path is required to list snapshot chunks.")); + *output = new ListSnapshotChunksDatasetOp::Dataset( + ctx, std::move(snapshot_path), output_types_, output_shapes_); +} + +std::unique_ptr +ListSnapshotChunksDatasetOp::Dataset::MakeIteratorInternal( + const std::string& prefix) const { + return std::make_unique( + ListSnapshotChunksDatasetOp::Dataset::Iterator::Params{ + this, + name_utils::IteratorPrefix(kListSnapshotChunksDataset, prefix)}); +} + +REGISTER_KERNEL_BUILDER(Name(kListSnapshotChunksDataset).Device(DEVICE_CPU), + ListSnapshotChunksDatasetOp); + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/data/service/snapshot/snapshot_chunk_dataset_op.cc b/tensorflow/core/data/service/snapshot/snapshot_chunk_dataset_op.cc index 
ad2e7ba41bdea8..49ec21ecf6e6b2 100644 --- a/tensorflow/core/data/service/snapshot/snapshot_chunk_dataset_op.cc +++ b/tensorflow/core/data/service/snapshot/snapshot_chunk_dataset_op.cc @@ -125,11 +125,12 @@ class SnapshotChunkDatasetOp::Dataset : public DatasetBase { explicit Iterator(const Params& params) : DatasetIterator(params) {} + ~Iterator() override { RecordBytesRead(); } + absl::Status Initialize(IteratorContext* ctx) override { reader_ = std::make_unique( TranslateFileName(dataset()->chunk_file_), dataset()->compression_, dataset()->dtypes_, kTFRecordReaderOutputBufferSize); - bytes_read_ = 0; return reader_->Initialize(ctx->env()); } @@ -147,7 +148,6 @@ class SnapshotChunkDatasetOp::Dataset : public DatasetBase { status, " Failed to read tf.data snapshot file: ", dataset()->chunk_file_); ++start_index_; - RecordBytesRead(); return status; } @@ -180,15 +180,12 @@ class SnapshotChunkDatasetOp::Dataset : public DatasetBase { void RecordBytesRead() { uint64_t bytes_read = reader_->BytesRead(); - static auto* bytes_counter = - metrics::GetTFDataBytesReadCounter(kSnapshotChunkDataset); - bytes_counter->IncrementBy(bytes_read - bytes_read_); - bytes_read_ = bytes_read; + metrics::GetTFDataBytesReadCounter(kSnapshotChunkDataset) + ->IncrementBy(bytes_read); } std::unique_ptr reader_; int64_t start_index_ = 0; - uint64_t bytes_read_ = 0; }; const tstring chunk_file_; diff --git a/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.cc b/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.cc new file mode 100644 index 00000000000000..b5910e081d1e7b --- /dev/null +++ b/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.cc @@ -0,0 +1,162 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h" + +#include +#include +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/data/service/snapshot/file_utils.h" +#include "tensorflow/core/data/service/snapshot/path_utils.h" +#include "tensorflow/core/framework/dataset.h" +#include "tsl/distributed_runtime/rpc/grpc_util.h" +#include "tsl/platform/env.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/path.h" +#include "tsl/platform/status_to_from_proto.h" +#include "tsl/platform/statusor.h" +#include "tsl/platform/tstring.h" +#include "tsl/protobuf/status.pb.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr char kChunksRead[] = "chunks_read"; +constexpr absl::string_view kSetElementDelimiter = ","; + +// Waits for a short period of time before retrying. 
+void Backoff(int num_retries, tsl::Env* env) { + if (num_retries >= 1) { // Does not backoff for the first try. + env->SleepForMicroseconds(tsl::ComputeBackoffMicroseconds(num_retries - 1)); + } +} + +std::string SetToString(const absl::flat_hash_set& s) { + return absl::StrJoin(s, kSetElementDelimiter); +} + +absl::flat_hash_set SetFromString(absl::string_view s) { + if (s.empty()) { + return {}; + } + std::vector split = absl::StrSplit(s, kSetElementDelimiter); + return absl::flat_hash_set(split.begin(), split.end()); +} + +} // namespace + +SnapshotChunkProvider::SnapshotChunkProvider(absl::string_view snapshot_path, + tsl::Env* env) + : snapshot_path_(snapshot_path), env_(env) {} + +absl::StatusOr> SnapshotChunkProvider::GetNext() + ABSL_LOCKS_EXCLUDED(mu_) { + for (int num_retries = 0;; ++num_retries) { + Backoff(num_retries, env_); + absl::MutexLock l(&mu_); + TF_RETURN_IF_ERROR(snapshot_state_.status); + if (!chunks_unread_.empty()) { + std::string next_chunk = *chunks_unread_.begin(); + chunks_read_.insert(next_chunk); + chunks_unread_.erase(next_chunk); + return tsl::io::JoinPath(CommittedChunksDirectory(snapshot_path_), + next_chunk); + } + if (snapshot_state_.snapshot_is_done) { + return std::nullopt; + } + TF_RETURN_IF_ERROR(UpdateSnapshot()); + } +} + +absl::Status SnapshotChunkProvider::UpdateSnapshot() + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + // Reads the state files first then reads the chunks. If we read chunks before + // reading the state files, the writer could write more chunks in between, and + // we may see the DONE file but miss those final chunks. + TF_ASSIGN_OR_RETURN(snapshot_state_, GetSnapshotState()); + TF_RETURN_IF_ERROR(snapshot_state_.status); + TF_ASSIGN_OR_RETURN(std::vector chunks, GetAvailableChunks()); + for (absl::string_view chunk : chunks) { + if (!chunks_read_.contains(chunk)) { + chunks_unread_.insert(std::string(chunk)); + } + } + return absl::OkStatus(); +} + +absl::StatusOr +SnapshotChunkProvider::GetSnapshotState() { + std::string error_file_path = SnapshotErrorFilePath(snapshot_path_); + if (env_->FileExists(error_file_path).ok()) { + StatusProto status_proto; + TF_RETURN_IF_ERROR(ReadTextProto(env_, error_file_path, &status_proto)); + absl::Status status = tsl::StatusFromProto(status_proto); + if (status.ok()) { + return absl::InternalError(absl::StrCat( + "Unexpected snapshot ERROR file contains an OK status at ", + error_file_path, ".")); + } + return SnapshotState(status); + } + return SnapshotState( + env_->FileExists(SnapshotDoneFilePath(snapshot_path_)).ok()); +} + +absl::StatusOr> +SnapshotChunkProvider::GetAvailableChunks() { + absl::StatusOr> status_or_chunks = + GetChildren(CommittedChunksDirectory(snapshot_path_), env_); + if (status_or_chunks.ok()) { + return *std::move(status_or_chunks); + } else if (absl::IsNotFound(status_or_chunks.status())) { + return std::vector{}; + } + return status_or_chunks.status(); +} + +absl::Status SnapshotChunkProvider::Save( + std::function full_name, + IteratorStateWriter* writer) { + absl::MutexLock l(&mu_); + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(kChunksRead), SetToString(chunks_read_))); + return absl::OkStatus(); +} + +absl::Status SnapshotChunkProvider::Restore( + std::function full_name, + IteratorStateReader* reader) { + absl::MutexLock l(&mu_); + tsl::tstring chunks_read; + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name(kChunksRead), &chunks_read)); + chunks_read_ = SetFromString(chunks_read); + return UpdateSnapshot(); +} + +} // namespace data +} // namespace tensorflow 
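The snapshot chunk provider above is consumed by looping on GetNext() until it returns std::nullopt, which is also what the GetAllChunks() test helper later in this diff does. The following is a minimal usage sketch, not part of the change itself; `ReadAllChunks` and `process_chunk_file` are hypothetical names supplied here for illustration.

// Minimal usage sketch (assumes the SnapshotChunkProvider API added in this
// diff; `process_chunk_file` is a hypothetical consumer).
#include <optional>
#include <string>

#include "absl/status/status.h"
#include "tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h"
#include "tsl/platform/env.h"
#include "tsl/platform/errors.h"
#include "tsl/platform/statusor.h"

namespace tensorflow {
namespace data {

// Reads every committed chunk of the snapshot at `snapshot_path`, blocking
// until the snapshot is done or fails.
absl::Status ReadAllChunks(const std::string& snapshot_path,
                           absl::Status (*process_chunk_file)(
                               const std::string& chunk_path)) {
  SnapshotChunkProvider provider(snapshot_path, tsl::Env::Default());
  while (true) {
    // GetNext() blocks until another chunk is committed or the snapshot
    // finishes, and returns std::nullopt once every chunk has been handed out.
    TF_ASSIGN_OR_RETURN(std::optional<std::string> chunk, provider.GetNext());
    if (!chunk.has_value()) return absl::OkStatus();
    TF_RETURN_IF_ERROR(process_chunk_file(*chunk));
  }
}

}  // namespace data
}  // namespace tensorflow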
diff --git a/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h b/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h new file mode 100644 index 00000000000000..17d932ea38d5ce --- /dev/null +++ b/tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h @@ -0,0 +1,100 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_CHUNK_PROVIDER_H_ +#define TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_CHUNK_PROVIDER_H_ + +#include +#include +#include + +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/dataset.h" +#include "tsl/platform/env.h" + +namespace tensorflow { +namespace data { + +// Provides the next chunk to read. Blocks until the next chunk is available, +// or all the chunks have been read. This class is thread-safe. +class SnapshotChunkProvider { + public: + SnapshotChunkProvider(absl::string_view snapshot_path, tsl::Env* env); + virtual ~SnapshotChunkProvider() = default; + SnapshotChunkProvider(const SnapshotChunkProvider&) = delete; + SnapshotChunkProvider& operator=(const SnapshotChunkProvider&) = delete; + + // Returns the absolute file path of the next snapshot chunk to read. If there is + // no available chunk, blocks until the next chunk becomes available, or all the + // chunks have been read. Returns std::nullopt if all chunks have been read. + absl::StatusOr> GetNext(); + + // Supports checkpointing. + absl::Status Save(std::function full_name, + IteratorStateWriter* writer); + absl::Status Restore(std::function full_name, + IteratorStateReader* reader); + + // TODO(b/297930782): Support cancellation. + + private: + // State of the snapshot. + struct SnapshotState { + SnapshotState() = default; + explicit SnapshotState(bool snapshot_is_done) + : snapshot_is_done(snapshot_is_done) {} + explicit SnapshotState(absl::Status status) : status(std::move(status)) {} + + // True if the snapshot is done without errors. + bool snapshot_is_done = false; + + // Non-OK status if writing the snapshot fails. + absl::Status status = absl::OkStatus(); + }; + + // Updates the snapshot state and available chunks. + absl::Status UpdateSnapshot(); + + // Reads the DONE or ERROR file and returns a SnapshotState indicating whether + // the snapshot is complete. + absl::StatusOr GetSnapshotState(); + + // Reads the available chunks from disk and returns a vector of chunk file + // names. + absl::StatusOr> GetAvailableChunks(); + + const std::string snapshot_path_; + tsl::Env* const env_; + + mutable absl::Mutex mu_; + + // The set of read chunks. + absl::flat_hash_set chunks_read_ ABSL_GUARDED_BY(mu_); + + // The set of unread chunks.
+  absl::flat_hash_set<std::string> chunks_unread_ ABSL_GUARDED_BY(mu_);
+
+  // State of the snapshot.
+  SnapshotState snapshot_state_ ABSL_GUARDED_BY(mu_);
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DATA_SERVICE_SNAPSHOT_SNAPSHOT_CHUNK_PROVIDER_H_
diff --git a/tensorflow/core/data/service/snapshot/snapshot_chunk_provider_test.cc b/tensorflow/core/data/service/snapshot/snapshot_chunk_provider_test.cc
new file mode 100644
index 00000000000000..28e31cae660e56
--- /dev/null
+++ b/tensorflow/core/data/service/snapshot/snapshot_chunk_provider_test.cc
@@ -0,0 +1,242 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/data/service/snapshot/snapshot_chunk_provider.h"
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/synchronization/mutex.h"
+#include "tensorflow/core/data/service/snapshot/file_utils.h"
+#include "tensorflow/core/data/service/snapshot/path_utils.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/env.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/path.h"
+#include "tsl/platform/status_matchers.h"
+#include "tsl/platform/status_to_from_proto.h"
+#include "tsl/platform/statusor.h"
+#include "tsl/platform/test.h"
+#include "tsl/protobuf/status.pb.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::UnorderedElementsAreArray;
+using ::tsl::testing::IsOkAndHolds;
+using ::tsl::testing::StatusIs;
+
+absl::StatusOr<std::string> CreateSnapshotDirectory() {
+  std::string snapshot_path;
+  if (!tsl::Env::Default()->LocalTempFilename(&snapshot_path)) {
+    return absl::FailedPreconditionError(
+        "Failed to create local temp file for snapshot.");
+  }
+  TF_RETURN_IF_ERROR(tsl::Env::Default()->RecursivelyCreateDir(
+      CommittedChunksDirectory(snapshot_path)));
+  return snapshot_path;
+}
+
+absl::Status WriteChunk(absl::string_view snapshot_path,
+                        absl::string_view chunk_file) {
+  return AtomicallyWriteStringToFile(
+      tsl::io::JoinPath(CommittedChunksDirectory(snapshot_path), chunk_file),
+      "", tsl::Env::Default());
+}
+
+absl::Status SetDone(absl::string_view snapshot_path) {
+  return AtomicallyWriteStringToFile(SnapshotDoneFilePath(snapshot_path), "",
+                                     tsl::Env::Default());
+}
+
+absl::Status SetStatus(absl::string_view snapshot_path,
+                       const absl::Status& status) {
+  return AtomicallyWriteTextProto(SnapshotErrorFilePath(snapshot_path),
+                                  tsl::StatusToProto(status),
+                                  tsl::Env::Default());
+}
+
+absl::StatusOr<std::vector<std::string>> GetAllChunks(
+    SnapshotChunkProvider& snapshot_chunk_provider) {
+  std::vector<std::string> chunks;
+  while (true) {
+    TF_ASSIGN_OR_RETURN(std::optional<std::string> chunk,
+                        snapshot_chunk_provider.GetNext());
+    if (!chunk.has_value()) {
+      break;
+    }
+    chunks.push_back(*chunk);
+  }
+  return chunks;
+}
+
+std::vector<std::string> JoinPaths(absl::string_view snapshot_path,
+                                   const std::vector<std::string> chunks) {
+  std::vector<std::string> joined_chunks;
+  for (absl::string_view chunk : chunks) {
+    joined_chunks.push_back(
+        tsl::io::JoinPath(CommittedChunksDirectory(snapshot_path), chunk));
+  }
+  return joined_chunks;
+}
+
+TEST(SnapshotChunkProviderTest, EmptySnapshot) {
+  TF_ASSERT_OK_AND_ASSIGN(std::string snapshot_path, CreateSnapshotDirectory());
+  TF_ASSERT_OK(SetDone(snapshot_path));
+
+  SnapshotChunkProvider snapshot_chunk_provider(snapshot_path,
+                                                tsl::Env::Default());
+  EXPECT_THAT(GetAllChunks(snapshot_chunk_provider), IsOkAndHolds(IsEmpty()));
+  EXPECT_THAT(GetAllChunks(snapshot_chunk_provider), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(SnapshotChunkProviderTest, SingleReader) {
+  TF_ASSERT_OK_AND_ASSIGN(std::string snapshot_path, CreateSnapshotDirectory());
+  std::vector<std::string> chunks = {"chunk_0_0_0", "chunk_1_1_1",
+                                     "chunk_2_2_2", "chunk_3_3_3",
+                                     "chunk_4_4_4"};
+  for (absl::string_view chunk : chunks) {
+    TF_ASSERT_OK(WriteChunk(snapshot_path, chunk));
+  }
+  TF_ASSERT_OK(SetDone(snapshot_path));
+
+  SnapshotChunkProvider snapshot_chunk_provider(snapshot_path,
+                                                tsl::Env::Default());
+  EXPECT_THAT(GetAllChunks(snapshot_chunk_provider),
+              IsOkAndHolds(
+                  UnorderedElementsAreArray(JoinPaths(snapshot_path, chunks))));
+}
+
+TEST(SnapshotChunkProviderTest, WaitForSnapshot) {
+  std::string snapshot_path;
+  ASSERT_TRUE(tsl::Env::Default()->LocalTempFilename(&snapshot_path));
+
+  absl::Mutex mu;
+  std::vector<std::string> result;  // Guarded by `mu`.
+  std::unique_ptr<tsl::Thread> reader_thread =
+      absl::WrapUnique(tsl::Env::Default()->StartThread(
+          /*thread_options=*/{}, /*name=*/"Reader",
+          [&snapshot_path, &mu, &result]() {
+            SnapshotChunkProvider snapshot_chunk_provider(snapshot_path,
+                                                          tsl::Env::Default());
+            TF_ASSERT_OK_AND_ASSIGN(std::vector<std::string> chunks,
+                                    GetAllChunks(snapshot_chunk_provider));
+            absl::MutexLock l(&mu);
+            result = std::move(chunks);
+          }));
+
+  {  // The reader should wait when there are no chunks.
+    absl::MutexLock l(&mu);
+    EXPECT_TRUE(result.empty());
+  }
+
+  TF_ASSERT_OK(tsl::Env::Default()->RecursivelyCreateDir(
+      CommittedChunksDirectory(snapshot_path)));
+  TF_ASSERT_OK(WriteChunk(snapshot_path, "chunk_0_0_0"));
+  TF_ASSERT_OK(SetDone(snapshot_path));
+
+  // The reader should be able to get chunks now.
+  reader_thread.reset();
+  absl::MutexLock l(&mu);
+  EXPECT_THAT(result, UnorderedElementsAreArray(
+                          JoinPaths(snapshot_path, {"chunk_0_0_0"})));
+}
+
+TEST(SnapshotChunkProviderTest, ConcurrentReadWrite) {
+  TF_ASSERT_OK_AND_ASSIGN(std::string snapshot_path, CreateSnapshotDirectory());
+
+  const int num_readers = 10;
+  absl::Mutex mu;
+  SnapshotChunkProvider snapshot_chunk_provider(snapshot_path,
+                                                tsl::Env::Default());
+  std::vector<std::string> result;  // Guarded by `mu`.
+ std::vector> reader_threads; + for (int i = 0; i < num_readers; ++i) { + reader_threads.push_back(absl::WrapUnique(tsl::Env::Default()->StartThread( + /*thread_options=*/{}, /*name=*/absl::StrCat("Reader_", i), + [&snapshot_chunk_provider, &mu, &result]() { + while (true) { + tsl::Env::Default()->SleepForMicroseconds(25); + TF_ASSERT_OK_AND_ASSIGN(std::optional chunk, + snapshot_chunk_provider.GetNext()); + if (!chunk.has_value()) { + break; + } + absl::MutexLock l(&mu); + result.push_back(std::move(*chunk)); + } + }))); + } + + int num_streams = 10, num_chunks_per_stream = 50; + std::vector> stream_threads; + for (int i = 0; i < num_streams; ++i) { + stream_threads.push_back(absl::WrapUnique(tsl::Env::Default()->StartThread( + /*thread_options=*/{}, /*name=*/absl::StrCat("Writer_", i), + [&snapshot_path, num_chunks_per_stream, i]() { + for (int j = 0; j < num_chunks_per_stream; ++j) { + std::string filename = absl::StrCat("chunk_", i, "_", j); + TF_ASSERT_OK(WriteChunk(snapshot_path, filename)); + tsl::Env::Default()->SleepForMicroseconds(35); + } + }))); + } + + stream_threads.clear(); + TF_ASSERT_OK(SetDone(snapshot_path)); + + reader_threads.clear(); + std::vector expected; + for (int i = 0; i < num_streams; ++i) { + for (int j = 0; j < num_chunks_per_stream; ++j) { + expected.push_back(absl::StrCat("chunk_", i, "_", j)); + } + } + EXPECT_THAT(result, + UnorderedElementsAreArray(JoinPaths(snapshot_path, expected))); +} + +TEST(SnapshotChunkProviderTest, SnapshotError) { + TF_ASSERT_OK_AND_ASSIGN(std::string snapshot_path, CreateSnapshotDirectory()); + std::unique_ptr reader_thread = + absl::WrapUnique(tsl::Env::Default()->StartThread( + /*thread_options=*/{}, /*name=*/"Reader", [&snapshot_path]() { + SnapshotChunkProvider snapshot_chunk_provider(snapshot_path, + tsl::Env::Default()); + EXPECT_THAT( + GetAllChunks(snapshot_chunk_provider), + StatusIs(absl::StatusCode::kFailedPrecondition, "Test error.")); + })); + + TF_ASSERT_OK(WriteChunk(snapshot_path, "chunk_0_0_0")); + TF_ASSERT_OK(WriteChunk(snapshot_path, "chunk_1_0_0")); + TF_ASSERT_OK(WriteChunk(snapshot_path, "chunk_2_0_0")); + TF_ASSERT_OK( + SetStatus(snapshot_path, absl::FailedPreconditionError("Test error."))); + reader_thread.reset(); +} + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/data/service/snapshot/snapshot_manager.cc b/tensorflow/core/data/service/snapshot/snapshot_manager.cc index f02652c49a0ca3..b4bd82b5ad2184 100644 --- a/tensorflow/core/data/service/snapshot/snapshot_manager.cc +++ b/tensorflow/core/data/service/snapshot/snapshot_manager.cc @@ -38,6 +38,7 @@ limitations under the License. 
#include "tensorflow/core/data/service/snapshot/path_utils.h" #include "tensorflow/core/data/service/split_provider.h" #include "tensorflow/core/data/snapshot_utils.h" +#include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/platform/status.h" #include "tsl/lib/io/compression.h" #include "tsl/platform/env.h" @@ -99,7 +100,7 @@ absl::Status SnapshotManager::Start(const SnapshotRequest& request) } tsl::mutex_lock l(mu_); TF_ASSIGN_OR_RETURN(sources_, CreateSources(request.dataset())); - TF_ASSIGN_OR_RETURN(num_total_splits_, CountSplits()); + TF_ASSIGN_OR_RETURN(num_total_splits_, GetSplitsCardinality()); TF_RETURN_IF_ERROR(WriteOnDiskSkeleton()); TF_RETURN_IF_ERROR(WriteOnDiskMetadata(request)); metadata_ = request.metadata(); @@ -120,6 +121,31 @@ SnapshotManager::CreateSources(const DatasetDef& dataset_def) const return sources; } +absl::StatusOr SnapshotManager::GetSplitsCardinality() + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (ShouldCountSplits()) { + return CountSplits(); + } + + int64_t num_splits = 0; + for (const auto& source : sources_) { + if (source.split_provider->Cardinality() > 0) { + num_splits += source.split_provider->Cardinality(); + } + } + return num_splits; +} + +bool SnapshotManager::ShouldCountSplits() const + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + for (const auto& source : sources_) { + if (source.split_provider->Cardinality() == kUnknownCardinality) { + return true; + } + } + return false; +} + absl::StatusOr SnapshotManager::CountSplits() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { int64_t num_splits = 0; @@ -210,7 +236,7 @@ absl::Status SnapshotManager::ReadOnDiskMetadata() ReadBinaryProto(env_, DatasetDefFilePath(path_), &dataset_def)); TF_ASSIGN_OR_RETURN(sources_, CreateSources(dataset_def)); - TF_ASSIGN_OR_RETURN(num_total_splits_, CountSplits()); + TF_ASSIGN_OR_RETURN(num_total_splits_, GetSplitsCardinality()); return absl::OkStatus(); } diff --git a/tensorflow/core/data/service/snapshot/snapshot_manager.h b/tensorflow/core/data/service/snapshot/snapshot_manager.h index 77a24ce915bc73..fe903fde98847f 100644 --- a/tensorflow/core/data/service/snapshot/snapshot_manager.h +++ b/tensorflow/core/data/service/snapshot/snapshot_manager.h @@ -247,6 +247,11 @@ class SnapshotManager { // Creates sources for the specified dataset. absl::StatusOr> CreateSources( const DatasetDef& dataset_def) const; + // Returns the total number of splits. + absl::StatusOr GetSplitsCardinality(); + // Returns true if we need to count the total number of splits for progress + // reporting. + bool ShouldCountSplits() const; // Counts the number of splits for a single repetition of the data in // `sources_`. 
absl::StatusOr CountSplits(); diff --git a/tensorflow/core/data/service/snapshot/snapshot_stream_writer_test.cc b/tensorflow/core/data/service/snapshot/snapshot_stream_writer_test.cc index d22e9573a15148..fcebb32e82a539 100644 --- a/tensorflow/core/data/service/snapshot/snapshot_stream_writer_test.cc +++ b/tensorflow/core/data/service/snapshot/snapshot_stream_writer_test.cc @@ -93,8 +93,6 @@ class ElementOrErrorIterator : public TaskIterator { int64_t Cardinality() const override { return elements_.size(); } - std::optional GetProcessingTimeNsec() const override { return 1.0e7; } - private: const std::vector> elements_; int64_t next_ = 0; diff --git a/tensorflow/core/data/service/split_provider_test.cc b/tensorflow/core/data/service/split_provider_test.cc new file mode 100644 index 00000000000000..08adc907058af2 --- /dev/null +++ b/tensorflow/core/data/service/split_provider_test.cc @@ -0,0 +1,115 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/data/service/split_provider.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/core/data/service/common.pb.h" +#include "tensorflow/core/data/service/test_util.h" +#include "tensorflow/core/framework/dataset.h" +#include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/statusor.h" +#include "tsl/platform/test.h" + +namespace tensorflow { +namespace data { +namespace { + +using ::testing::ElementsAre; +using ::testing::UnorderedElementsAre; + +std::vector GetCardinalities( + const std::vector>& split_providers) { + std::vector cardinalities; + for (const auto& split_provider : split_providers) { + cardinalities.push_back(split_provider->Cardinality()); + } + return cardinalities; +} + +TEST(SplitProviderTest, RangeCardinality) { + DatasetDef range_dataset = testing::RangeDataset(10); + std::vector> split_providers; + TF_ASSERT_OK(CreateSplitProviders(range_dataset, split_providers)); + EXPECT_THAT(GetCardinalities(split_providers), UnorderedElementsAre(10)); +} + +class RepeatedSplitProviderTest + : public ::testing::TestWithParam> { + public: + int64_t Range() const { return std::get<0>(GetParam()); } + int64_t RepeatCount() const { return std::get<1>(GetParam()); } + int64_t ExpectedCardinality() const { return std::get<2>(GetParam()); } +}; + +// Test cases for the `RepeatedDatasetCardinality` test. The tuples specify +// {range, repeat count, expected cardinality}. 
+constexpr std::array, 5> + kRepeatedSplitProviderTestCases{{{9, 9, 81}, + {9, 0, 0}, + {9, -1, kInfiniteCardinality}, + {0, -1, 0}, + {-1, 1, 0}}}; + +TEST_P(RepeatedSplitProviderTest, RepeatedDatasetCardinality) { + TF_ASSERT_OK_AND_ASSIGN( + DatasetDef repeated_dataset, + testing::GetTestDataset( + "repeated_dataset", + {absl::StrCat(Range()), absl::StrCat(RepeatCount())})); + std::vector> split_providers; + TF_ASSERT_OK(CreateSplitProviders(repeated_dataset, split_providers)); + EXPECT_THAT(GetCardinalities(split_providers), + ElementsAre(ExpectedCardinality())); +} + +INSTANTIATE_TEST_SUITE_P(MyGroup, RepeatedSplitProviderTest, + ::testing::ValuesIn(kRepeatedSplitProviderTestCases)); + +TEST(SplitProviderTest, EnumerateCardinality) { + TF_ASSERT_OK_AND_ASSIGN(DatasetDef enumerate_dataset, + testing::GetTestDataset("enumerate_dataset")); + std::vector> split_providers; + TF_ASSERT_OK(CreateSplitProviders(enumerate_dataset, split_providers)); + EXPECT_THAT(GetCardinalities(split_providers), + UnorderedElementsAre(3, kInfiniteCardinality)); +} + +TEST(SplitProviderTest, ChooseFromDatasetsCardinality) { + TF_ASSERT_OK_AND_ASSIGN(DatasetDef sample_from_datasets, + testing::GetTestDataset("choose_from_datasets")); + std::vector> split_providers; + TF_ASSERT_OK(CreateSplitProviders(sample_from_datasets, split_providers)); + EXPECT_THAT(GetCardinalities(split_providers), + UnorderedElementsAre(5, 5, 5, kInfiniteCardinality)); +} + +TEST(SplitProviderTest, SampleFromDatasetsCardinality) { + TF_ASSERT_OK_AND_ASSIGN(DatasetDef sample_from_datasets, + testing::GetTestDataset("sample_from_datasets")); + std::vector> split_providers; + TF_ASSERT_OK(CreateSplitProviders(sample_from_datasets, split_providers)); + EXPECT_THAT(GetCardinalities(split_providers), + UnorderedElementsAre(5, 5, 5, kInfiniteCardinality)); +} + +} // namespace +} // namespace data +} // namespace tensorflow diff --git a/tensorflow/core/data/service/task_runner.cc b/tensorflow/core/data/service/task_runner.cc index c0240f27235557..6c169d2bf90a93 100644 --- a/tensorflow/core/data/service/task_runner.cc +++ b/tensorflow/core/data/service/task_runner.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/core/data/standalone.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/model.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/lib/gtl/cleanup.h" @@ -73,8 +74,8 @@ Status StandaloneTaskIterator::Restore( return iterator_->Restore(saved_iterator); } -std::optional StandaloneTaskIterator::GetProcessingTimeNsec() const { - return iterator_->GetProcessingTimeNsec(); +std::shared_ptr StandaloneTaskIterator::model() const { + return iterator_->model(); } Status TaskRunner::Create(const experimental::WorkerConfig& worker_config, @@ -168,10 +169,8 @@ void FirstComeFirstServedTaskRunner::Cancel() { buffer_.Cancel(errors::Cancelled("tf.data service FCFS task is cancelled.")); } -std::optional FirstComeFirstServedTaskRunner::GetProcessingTimeNsec() - TF_LOCKS_EXCLUDED(mu_) { - mutex_lock l(mu_); - return iterator_->GetProcessingTimeNsec(); +std::shared_ptr FirstComeFirstServedTaskRunner::model() const { + return model_; } CachingTaskRunner::CachingTaskRunner(std::unique_ptr iterator, @@ -223,8 +222,8 @@ void CachingTaskRunner::Cancel() { fcfs_task_runner_.Cancel(); } -std::optional CachingTaskRunner::GetProcessingTimeNsec() { - return fcfs_task_runner_.GetProcessingTimeNsec(); +std::shared_ptr CachingTaskRunner::model() const { + return fcfs_task_runner_.model(); } RoundRobinTaskRunner::RoundRobinTaskRunner( @@ -361,8 +360,8 @@ void RoundRobinTaskRunner::Cancel() { new_round_cv_.notify_all(); } -std::optional RoundRobinTaskRunner::GetProcessingTimeNsec() { - return prefetch_thread_.GetProcessingTimeNsec(); +std::shared_ptr RoundRobinTaskRunner::model() const { + return prefetch_thread_.model(); } PrefetchThread::PrefetchThread(std::unique_ptr iterator, @@ -447,8 +446,8 @@ Status PrefetchThread::GetStatus() { return status_; } -std::optional PrefetchThread::GetProcessingTimeNsec() const { - return iterator_->GetProcessingTimeNsec(); +std::shared_ptr PrefetchThread::model() const { + return iterator_->model(); } } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/service/task_runner.h b/tensorflow/core/data/service/task_runner.h index a1db121ac602ca..565a7f4727a477 100644 --- a/tensorflow/core/data/service/task_runner.h +++ b/tensorflow/core/data/service/task_runner.h @@ -63,12 +63,8 @@ class TaskIterator { "Restoring from a tf.data service task iterator is unsupported."); } - // Returns the time it takes the pipeline associated with this task iterator - // to process an element. - // Returns std::nullopt if there is not currently enough information to - // determine the processing time, e.g. because not enough data has been - // produced yet from the iterator. - virtual std::optional GetProcessingTimeNsec() const = 0; + // Returns the dataset model for performance analysis. + virtual std::shared_ptr model() const { return nullptr; } }; // Implementation of TaskIterator wrapping a standalone iterator. @@ -83,7 +79,7 @@ class StandaloneTaskIterator : public TaskIterator { int64_t Cardinality() const override; StatusOr> Save() override; Status Restore(const std::vector& saved_iterator) override; - std::optional GetProcessingTimeNsec() const override; + std::shared_ptr model() const override; private: std::unique_ptr dataset_; @@ -102,14 +98,10 @@ class TaskRunner { // Gets the next element for the given request. 
virtual Status GetNext(const GetElementRequest& req, GetElementResult& result) = 0; - // Returns the time it takes the pipeline associated with this task runner to - // process an element. Returns 0 if the model is null or empty. - // Returns std::nullopt if there is not currently enough information to - // determine the processing time, e.g. because not enough data has been - // produced yet from the iterator. - virtual std::optional GetProcessingTimeNsec() = 0; // Cancels in-progress `GetNext` requests. virtual void Cancel() = 0; + // Returns the dataset model for performance analysis. + virtual std::shared_ptr model() const = 0; }; // A task runner which provides elements on a first-come first-served basis. @@ -127,7 +119,7 @@ class FirstComeFirstServedTaskRunner : public TaskRunner { void Cancel() override; - std::optional GetProcessingTimeNsec() override TF_LOCKS_EXCLUDED(mu_); + std::shared_ptr model() const override; private: // Function to continually prefetch the next element. Returns an error if the @@ -140,6 +132,7 @@ class FirstComeFirstServedTaskRunner : public TaskRunner { // Gets the next element from the input iterator. StatusOr GetNextFromInputIterator() TF_LOCKS_EXCLUDED(mu_); + const std::shared_ptr model_; mutex mu_; std::unique_ptr iterator_ TF_GUARDED_BY(mu_); int64_t element_index_ TF_GUARDED_BY(mu_) = 0; @@ -173,7 +166,8 @@ class CachingTaskRunner : public TaskRunner { // return a Cancelled status. void Cancel() override; - std::optional GetProcessingTimeNsec() override; + // Returns the dataset model for performance analysis. + std::shared_ptr model() const override; private: // The `GetElementResultSequence` generates a sequence of elements from the @@ -224,7 +218,8 @@ class PrefetchThread { std::vector>& out); // Returns the status for any failures encountered by the prefetch thread. Status GetStatus(); - std::optional GetProcessingTimeNsec() const; + // Returns the dataset model for performance analysis. + std::shared_ptr model() const; private: const std::unique_ptr iterator_; @@ -269,7 +264,7 @@ class RoundRobinTaskRunner : public TaskRunner { Status GetNext(const GetElementRequest& req, GetElementResult& result) override; void Cancel() override; - std::optional GetProcessingTimeNsec() override; + std::shared_ptr model() const override; private: // Prepares a full round of data. `wait_us` indicates how long to wait before diff --git a/tensorflow/core/data/service/task_runner_test.cc b/tensorflow/core/data/service/task_runner_test.cc index 5650e28627e631..0c1ef895742b0c 100644 --- a/tensorflow/core/data/service/task_runner_test.cc +++ b/tensorflow/core/data/service/task_runner_test.cc @@ -77,8 +77,6 @@ class RangeIterator : public TaskIterator { return repeat_ ? 
kInfiniteCardinality : range_; } - std::optional GetProcessingTimeNsec() const override { return 1.0e7; } - private: const int64_t range_; const bool repeat_; @@ -96,8 +94,6 @@ class InfiniteRangeIterator : public TaskIterator { int64_t Cardinality() const override { return kInfiniteCardinality; } - std::optional GetProcessingTimeNsec() const override { return 1.0e7; } - private: int64_t next_ = 0; }; @@ -121,8 +117,6 @@ class ElementOrErrorIterator : public TaskIterator { int64_t Cardinality() const override { return elements_.size(); } - std::optional GetProcessingTimeNsec() const override { return 1.0e7; } - private: const std::vector> elements_; int64_t next_ = 0; diff --git a/tensorflow/core/data/service/test_util.cc b/tensorflow/core/data/service/test_util.cc index 8d7abf3a9eb540..766fe07c4f7469 100644 --- a/tensorflow/core/data/service/test_util.cc +++ b/tensorflow/core/data/service/test_util.cc @@ -19,8 +19,11 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "absl/types/span.h" +#include "absl/strings/substitute.h" #include "tensorflow/core/data/dataset_test_base.h" #include "tensorflow/core/data/service/common.pb.h" #include "tensorflow/core/framework/function.h" @@ -33,13 +36,11 @@ limitations under the License. #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/path.h" -#include "tensorflow/core/platform/status.h" -#include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/platform/tstring.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/struct.pb.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/protobuf.h" namespace tensorflow { namespace data { @@ -53,9 +54,11 @@ using ::tensorflow::test::function::NDef; constexpr int64_t kShardHint = -1; constexpr const char kTestdataDir[] = "tensorflow/core/data/service/testdata"; +constexpr const char kEnumerateDatasetFile[] = "enumerate_dataset.pbtxt"; constexpr const char kInterleaveTextlineDatasetFile[] = "interleave_textline_dataset.pbtxt"; constexpr const char kChooseFromDatasetsFile[] = "choose_from_datasets.pbtxt"; +constexpr const char kSampleFromDatasetsFile[] = "sample_from_datasets.pbtxt"; NodeDef GetMapNode(absl::string_view name, absl::string_view input_node_name, absl::string_view function_name) { @@ -77,16 +80,16 @@ FunctionDef XTimesX() { /*ret_def=*/{{"y", "y:z:0"}}); } -Status CreateTestFiles(const std::vector& filenames, - const std::vector& contents) { +absl::Status CreateTestFiles(const std::vector& filenames, + const std::vector& contents) { if (filenames.size() != contents.size()) { - return errors::InvalidArgument( + return absl::InvalidArgumentError( "The number of files does not match with the contents."); } for (int i = 0; i < filenames.size(); ++i) { TF_RETURN_IF_ERROR(WriteDataToFile(filenames[i], contents[i].data())); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -96,6 +99,32 @@ std::string LocalTempFilename() { return path; } +absl::StatusOr GetTestDataset( + absl::string_view dataset_name, const std::vector& args) { + std::string graph_file = + io::JoinPath(kTestdataDir, absl::StrCat(dataset_name, ".pbtxt")); + std::string graph_str; + TF_RETURN_IF_ERROR(ReadFileToString(Env::Default(), graph_file, 
&graph_str)); + if (args.size() == 1) { + graph_str = absl::Substitute(graph_str, args[0]); + } else if (args.size() == 2) { + graph_str = absl::Substitute(graph_str, args[0], args[1]); + } else if (args.size() == 3) { + graph_str = absl::Substitute(graph_str, args[0], args[1], args[2]); + } else if (args.size() > 3) { + return absl::UnimplementedError( + "GetTestDataset does not support more than 3 arguments."); + } + + DatasetDef dataset; + if (!tsl::protobuf::TextFormat::ParseFromString(graph_str, + dataset.mutable_graph())) { + return absl::FailedPreconditionError( + absl::StrCat("Can't parse ", graph_file, " as text proto.")); + } + return dataset; +} + DatasetDef RangeDataset(int64_t range) { DatasetDef dataset_def; *dataset_def.mutable_graph() = GDef( @@ -182,14 +211,6 @@ DatasetDef InfiniteDataset() { return dataset_def; } -StatusOr ChooseFromDatasets() { - DatasetDef dataset; - std::string graph_file = io::JoinPath(kTestdataDir, kChooseFromDatasetsFile); - TF_RETURN_IF_ERROR( - ReadTextProto(Env::Default(), graph_file, dataset.mutable_graph())); - return dataset; -} - experimental::DistributedSnapshotMetadata CreateDummyDistributedSnapshotMetadata() { StructuredValue decoded_spec; @@ -204,7 +225,7 @@ CreateDummyDistributedSnapshotMetadata() { return metadata; } -StatusOr InterleaveTextlineDataset( +absl::StatusOr InterleaveTextlineDataset( const std::vector& filenames, const std::vector& contents) { TF_RETURN_IF_ERROR(CreateTestFiles(filenames, contents)); @@ -222,11 +243,11 @@ StatusOr InterleaveTextlineDataset( return dataset; } -Status WaitWhile(std::function()> f) { +absl::Status WaitWhile(std::function()> f) { while (true) { TF_ASSIGN_OR_RETURN(bool result, f()); if (!result) { - return OkStatus(); + return absl::OkStatus(); } Env::Default()->SleepForMicroseconds(10 * 1000); // 10ms. } diff --git a/tensorflow/core/data/service/test_util.h b/tensorflow/core/data/service/test_util.h index a175c543cbeef8..2180675b74e16f 100644 --- a/tensorflow/core/data/service/test_util.h +++ b/tensorflow/core/data/service/test_util.h @@ -20,9 +20,10 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "tensorflow/core/data/service/common.pb.h" #include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/tstring.h" #include "tensorflow/core/platform/types.h" @@ -35,6 +36,13 @@ namespace testing { // Creates a local tempfile and returns the path. std::string LocalTempFilename(); +// Creates a dataset graph for testing. `dataset_name` is one of the filenames +// defined in `testdata` (without `.pbtxt`). `args` specifies arguments passed +// to the dataset. These args appear as `$0`, `$1`, etc, in the dataset +// definition and will be replaced with the specified args. +absl::StatusOr GetTestDataset( + absl::string_view dataset_name, const std::vector& args = {}); + // Returns a test dataset representing // tf.data.Dataset.range(range). Useful for testing dataset graph execution. DatasetDef RangeDataset(int64_t range); @@ -51,14 +59,6 @@ DatasetDef RangeDatasetWithShardHint(int64_t range); // tf.data.Dataset.range(100000000).repeat(). 
DatasetDef InfiniteDataset(); -// Returns a test dataset representing -// datasets = [tf.data.Dataset.from_tensor_slices(["a", "a", "a", "a", "a"]), -// tf.data.Dataset.from_tensor_slices(["b", "b", "b", "b", "b"]), -// tf.data.Dataset.from_tensor_slices(["c", "c", "c", "c", "c"])] -// choice_dataset = tf.data.Dataset.range(3).repeat() -// dataset = tf.data.Dataset.choose_from_datasets(datasets, choice_dataset) -StatusOr ChooseFromDatasets(); - // Returns a distributed snapshot metadata for a dummy dataset. experimental::DistributedSnapshotMetadata CreateDummyDistributedSnapshotMetadata(); @@ -67,14 +67,14 @@ CreateDummyDistributedSnapshotMetadata(); // tf.data.Dataset.from_tensor_slices(["filenames"]).interleave( // lambda filepath: tf.data.TextLineDataset(filepath), // cycle_length=10) -StatusOr InterleaveTextlineDataset( +absl::StatusOr InterleaveTextlineDataset( const std::vector& filenames, const std::vector& contents); // Repeatedly calls `f()`, blocking until `f()` returns `false`. // // Returns an error if `f()` returns an error. -Status WaitWhile(std::function()> f); +absl::Status WaitWhile(std::function()> f); // TODO(b/229726259): Make EqualsProto available in Googletest // (Public feature request: https://github.com/google/googletest/issues/1761). diff --git a/tensorflow/core/data/service/test_util_test.cc b/tensorflow/core/data/service/test_util_test.cc index 9608163e9f47e8..0cf43eb404631e 100644 --- a/tensorflow/core/data/service/test_util_test.cc +++ b/tensorflow/core/data/service/test_util_test.cc @@ -158,8 +158,9 @@ TEST(TestUtilTest, InterleaveTextlineEmptyFiles) { EXPECT_THAT(GetIteratorOutput(*iterator), IsOkAndHolds(IsEmpty())); } -TEST(TestUtilTest, ChooseFromDatasets) { - TF_ASSERT_OK_AND_ASSIGN(const DatasetDef dataset_def, ChooseFromDatasets()); +TEST(TestUtilTest, GetTestDataset) { + TF_ASSERT_OK_AND_ASSIGN(const DatasetDef dataset_def, + GetTestDataset("choose_from_datasets")); standalone::Dataset::Params params; std::unique_ptr dataset; TF_ASSERT_OK( diff --git a/tensorflow/core/data/service/testdata/enumerate_dataset.pbtxt b/tensorflow/core/data/service/testdata/enumerate_dataset.pbtxt new file mode 100644 index 00000000000000..c0066f9da50e72 --- /dev/null +++ b/tensorflow/core/data/service/testdata/enumerate_dataset.pbtxt @@ -0,0 +1,288 @@ +# proto-file: third_party/tensorflow/core/framework/graph.proto +# proto-message: GraphDef +# +# Proto content generated by +# +# import tensorflow as tf +# +# dataset = tf.data.Dataset.from_tensor_slices(["a", "b", "c"]) +# dataset = dataset.enumerate() +# +# g = tf.compat.v1.GraphDef() +# g.ParseFromString(dataset._as_serialized_graph().numpy()) +# print(g) + +node { + name: "Const/_0" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 0 + } + } + } +} +node { + name: "Const/_1" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 9223372036854775807 + } + } + } +} +node { + name: "Const/_2" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 1 + } + } + } +} +node { + name: "RangeDataset/_3" + op: "RangeDataset" + input: "Const/_0" + input: "Const/_1" + input: "Const/_2" + attr { + key: "metadata" + value { + s: "\n\016RangeDataset:9" + } + } + attr { + key: "output_shapes" 
+ value { + list { + shape { + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + } + } + } + attr { + key: "replicate_on_split" + value { + b: true + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + } + } + } +} +node { + name: "Const/_4" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 3 + } + } + tensor_content: "\001\001\001abc" + } + } + } +} +node { + name: "TensorSliceDataset/_5" + op: "TensorSliceDataset" + input: "Const/_4" + attr { + key: "Toutput_types" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "is_files" + value { + b: false + } + } + attr { + key: "metadata" + value { + s: "\n\024TensorSliceDataset:7" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "replicate_on_split" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_STRING + } + } + } + } + } +} +node { + name: "ZipDataset/_6" + op: "ZipDataset" + input: "RangeDataset/_3" + input: "TensorSliceDataset/_5" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "metadata" + value { + s: "\n\rZipDataset:10" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + shape { + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + type: DT_STRING + } + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + args { + type_id: TFT_TENSOR + args { + type_id: TFT_STRING + } + } + } + } + } +} +node { + name: "dataset" + op: "_Retval" + input: "ZipDataset/_6" + attr { + key: "T" + value { + type: DT_VARIANT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 1700 +} diff --git a/tensorflow/core/data/service/testdata/repeated_dataset.pbtxt b/tensorflow/core/data/service/testdata/repeated_dataset.pbtxt new file mode 100644 index 00000000000000..8dfca4717c97e3 --- /dev/null +++ b/tensorflow/core/data/service/testdata/repeated_dataset.pbtxt @@ -0,0 +1,215 @@ +# proto-file: third_party/tensorflow/core/framework/graph.proto +# proto-message: GraphDef +# +# Proto content generated by +# +# import tensorflow as tf +# +# dataset = tf.data.Dataset.range($0) +# dataset = dataset.repeat($1) +# +# g = tf.compat.v1.GraphDef() +# g.ParseFromString(dataset._as_serialized_graph().numpy()) +# print(g) + +node { + name: "Const/_0" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 0 + } + } + } +} +node { + name: "Const/_1" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: $0 + } + } + } +} +node { + name: "Const/_2" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 1 + } + } + } +} +node { + name: "RangeDataset/_3" + op: "RangeDataset" + input: "Const/_0" + input: "Const/_1" + 
input: "Const/_2" + attr { + key: "metadata" + value { + s: "\n\017RangeDataset:15" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + } + } + } + attr { + key: "replicate_on_split" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + } + } + } +} +node { + name: "Const/_4" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: $1 + } + } + } +} +node { + name: "RepeatDataset/_5" + op: "RepeatDataset" + input: "RangeDataset/_3" + input: "Const/_4" + attr { + key: "metadata" + value { + s: "\n\020RepeatDataset:16" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + } + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + } + } + } +} +node { + name: "dataset" + op: "_Retval" + input: "RepeatDataset/_5" + attr { + key: "T" + value { + type: DT_VARIANT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 1700 +} diff --git a/tensorflow/core/data/service/testdata/sample_from_datasets.pbtxt b/tensorflow/core/data/service/testdata/sample_from_datasets.pbtxt new file mode 100644 index 00000000000000..4b6276b1923db7 --- /dev/null +++ b/tensorflow/core/data/service/testdata/sample_from_datasets.pbtxt @@ -0,0 +1,762 @@ +# proto-file: third_party/tensorflow/core/framework/graph.proto +# proto-message: GraphDef +# +# Proto content generated by +# +# import tensorflow as tf +# +# datasets = [tf.data.Dataset.from_tensor_slices(["a", "a", "a", "a", "a"]), +# tf.data.Dataset.from_tensor_slices(["b", "b", "b", "b", "b"]), +# tf.data.Dataset.from_tensor_slices(["c", "c", "c", "c", "c"])] +# dataset = tf.data.Dataset.sample_from_datasets( +# datasets, weights=[1.0] * len(datasets)) +# +# g = tf.compat.v1.GraphDef() +# g.ParseFromString(dataset._as_serialized_graph().numpy()) +# print(g) + +node { + name: "Const/_0" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 0 + } + } + } +} +node { + name: "Const/_1" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 0 + } + } + } +} +node { + name: "RandomDataset/_2" + op: "RandomDataset" + input: "Const/_0" + input: "Const/_1" + attr { + key: "metadata" + value { + s: "\n\017RandomDataset:3" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + unknown_rank: true + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + } + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + } + } + } +} +node { + name: "Const/_3" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + } + int64_val: 2 + } + } + } +} +node { + name: 
"Const/_4" + op: "Const" + attr { + key: "dtype" + value { + type: DT_BOOL + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_BOOL + tensor_shape { + } + bool_val: false + } + } + } +} +node { + name: "BatchDatasetV2/_5" + op: "BatchDatasetV2" + input: "RandomDataset/_2" + input: "Const/_3" + input: "Const/_4" + attr { + key: "metadata" + value { + s: "\n\020BatchDatasetV2:4" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + unknown_rank: true + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + } + } + } + attr { + key: "parallel_copy" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + } + } + } +} +node { + name: "Const/_6" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + dim { + size: 3 + } + } + tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000" + } + } + } +} +node { + name: "MapDataset/_7" + op: "MapDataset" + input: "BatchDatasetV2/_5" + input: "Const/_6" + attr { + key: "Targuments" + value { + list { + type: DT_FLOAT + } + } + } + attr { + key: "f" + value { + func { + name: "__inference_Dataset_map_select_dataset_constant_logits_24" + } + } + } + attr { + key: "metadata" + value { + s: "\n\014MapDataset:5" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_INT64 + } + } + } + attr { + key: "preserve_cardinality" + value { + b: true + } + } + attr { + key: "use_inter_op_parallelism" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_INT64 + } + } + } + } + } +} +node { + name: "Const/_8" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 5 + } + } + tensor_content: "\001\001\001\001\001aaaaa" + } + } + } +} +node { + name: "TensorSliceDataset/_9" + op: "TensorSliceDataset" + input: "Const/_8" + attr { + key: "Toutput_types" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "is_files" + value { + b: false + } + } + attr { + key: "metadata" + value { + s: "\n\024TensorSliceDataset:0" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "replicate_on_split" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_STRING + } + } + } + } + } +} +node { + name: "Const/_10" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 5 + } + } + tensor_content: "\001\001\001\001\001bbbbb" + } + } + } +} +node { + name: "TensorSliceDataset/_11" + op: "TensorSliceDataset" + input: "Const/_10" + attr { + key: "Toutput_types" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "is_files" + value { + b: false + } + } + attr { + key: "metadata" + value { + s: "\n\024TensorSliceDataset:1" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + 
} + } + attr { + key: "replicate_on_split" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_STRING + } + } + } + } + } +} +node { + name: "Const/_12" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 5 + } + } + tensor_content: "\001\001\001\001\001ccccc" + } + } + } +} +node { + name: "TensorSliceDataset/_13" + op: "TensorSliceDataset" + input: "Const/_12" + attr { + key: "Toutput_types" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "is_files" + value { + b: false + } + } + attr { + key: "metadata" + value { + s: "\n\024TensorSliceDataset:2" + } + } + attr { + key: "output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "replicate_on_split" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_STRING + } + } + } + } + } +} +node { + name: "DirectedInterleaveDataset/_14" + op: "DirectedInterleaveDataset" + input: "MapDataset/_7" + input: "TensorSliceDataset/_9" + input: "TensorSliceDataset/_11" + input: "TensorSliceDataset/_13" + attr { + key: "N" + value { + i: 3 + } + } + attr { + key: "output_shapes" + value { + list { + shape { + unknown_rank: true + } + } + } + } + attr { + key: "output_types" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "stop_on_empty_dataset" + value { + b: false + } + } + experimental_type { + type_id: TFT_PRODUCT + args { + type_id: TFT_DATASET + args { + type_id: TFT_PRODUCT + args { + type_id: TFT_TENSOR + args { + type_id: TFT_STRING + } + } + } + } + } +} +node { + name: "dataset" + op: "_Retval" + input: "DirectedInterleaveDataset/_14" + attr { + key: "T" + value { + type: DT_VARIANT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { + function { + signature { + name: "__inference_Dataset_map_select_dataset_constant_logits_24" + input_arg { + name: "args_0" + type: DT_INT64 + } + input_arg { + name: "statelessmultinomial_logits" + type: DT_FLOAT + } + output_arg { + name: "identity" + type: DT_INT64 + } + } + node_def { + name: "StatelessMultinomial/num_samples" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } + } + node_def { + name: "StatelessMultinomial" + op: "StatelessMultinomial" + input: "statelessmultinomial_logits" + input: "StatelessMultinomial/num_samples:output:0" + input: "args_0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "Tseed" + value { + type: DT_INT64 + } + } + attr { + key: "output_dtype" + value { + type: DT_INT64 + } + } + } + node_def { + name: "Squeeze" + op: "Squeeze" + input: "StatelessMultinomial:output:0" + attr { + key: "T" + value { + type: DT_INT64 + } + } + attr { + key: "squeeze_dims" + value { + list { + i: 0 + i: 1 + } + } + } + } + node_def { + name: "Identity" + op: "Identity" + input: "Squeeze:output:0" + attr { + key: "T" + value { + type: DT_INT64 + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + attr { + key: "_construction_context" + value { + s: "kEagerRuntime" + } + } + attr { + key: "_tf_data_function" + value { + b: true + } + } + arg_attr { + key: 0 + value 
{ + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: -1 + } + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "args_0" + } + } + } + } + arg_attr { + key: 1 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + dim { + size: 3 + } + } + } + } + } + } + } + } +} +versions { + producer: 1700 +} diff --git a/tensorflow/core/data/service/worker_impl.cc b/tensorflow/core/data/service/worker_impl.cc index ce1a3feed067fd..70e8311c2d0b3d 100644 --- a/tensorflow/core/data/service/worker_impl.cc +++ b/tensorflow/core/data/service/worker_impl.cc @@ -47,6 +47,7 @@ limitations under the License. #include "tensorflow/core/data/standalone.h" #include "tensorflow/core/framework/dataset.pb.h" #include "tensorflow/core/framework/metrics.h" +#include "tensorflow/core/framework/model.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/lib/core/errors.h" @@ -615,11 +616,14 @@ std::vector DataServiceWorkerImpl::GetActiveTasks() const mutex_lock task_lock(task->mu); task_initialized = task->initialized; } - if (task_initialized && task->task_runner != nullptr) { - std::optional processing_time_nsec = - task->task_runner->GetProcessingTimeNsec(); - active_task.set_processing_time_nsec( - processing_time_nsec ? processing_time_nsec.value() : 0.0); + + if (task_initialized && task->task_runner != nullptr && + task->task_runner->model() != nullptr) { + std::shared_ptr model = task->task_runner->model(); + double processing_time_nsec = model->ComputeSnapshotProcessingTimeNsec(); + if (processing_time_nsec > 0) { + active_task.set_processing_time_nsec(processing_time_nsec); + } } active_tasks.push_back(std::move(active_task)); } diff --git a/tensorflow/core/data/split_utils.cc b/tensorflow/core/data/split_utils.cc index 350c79b0897a72..da75c168126fb0 100644 --- a/tensorflow/core/data/split_utils.cc +++ b/tensorflow/core/data/split_utils.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tsl/platform/errors.h" #include "tsl/platform/logging.h" #include "tsl/platform/mutex.h" +#include "tsl/platform/types.h" namespace tensorflow { namespace data { @@ -78,6 +79,15 @@ absl::Status IndexSplitProvider::Restore( return reader->ReadScalar(full_name(kIndex), &i_); } +int64_t IndexSplitProvider::Cardinality() const { + // RandomDataset uses kint64max to simulate infinite splits. + // See RandomDatasetOp::Dataset::MakeSplitProviders. + if (n_ == tsl::kint64max) { + return kInfiniteCardinality; + } + return n_; +} + ShardingSplitProvider::ShardingSplitProvider( int64_t num_shards, int64_t shard_index, std::shared_ptr split_provider) diff --git a/tensorflow/core/data/split_utils.h b/tensorflow/core/data/split_utils.h index 0801d9afd546e7..a0fdef8d2d2213 100644 --- a/tensorflow/core/data/split_utils.h +++ b/tensorflow/core/data/split_utils.h @@ -42,6 +42,7 @@ class IndexSplitProvider : public SplitProvider { IteratorStateWriter* writer) override; absl::Status Restore(std::function full_name, IteratorStateReader* reader) override; + int64_t Cardinality() const override; private: tsl::mutex mu_; diff --git a/tensorflow/core/data/standalone.cc b/tensorflow/core/data/standalone.cc index 1790b29730249c..04a425170b27be 100644 --- a/tensorflow/core/data/standalone.cc +++ b/tensorflow/core/data/standalone.cc @@ -40,6 +40,7 @@ limitations under the License. 
#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function_handle_cache.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/model.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" @@ -116,16 +117,7 @@ Status Iterator::Restore(const std::vector& saved_iterator) { return iterator_->Restore(ctx_.get(), &reader); } -std::optional Iterator::GetProcessingTimeNsec() const { - if (ctx_->model() == nullptr) return std::nullopt; - - double processing_time_nsec = - ctx_->model()->ComputeSnapshotProcessingTimeNsec(); - if (processing_time_nsec > 0) - return processing_time_nsec; - else - return std::nullopt; -} +std::shared_ptr Iterator::model() const { return ctx_->model(); } Status Dataset::FromGraph(Params params, const GraphDef& graph_def, std::unique_ptr* result) { diff --git a/tensorflow/core/data/standalone.h b/tensorflow/core/data/standalone.h index 0854869fb67a30..5de0d81b274b30 100644 --- a/tensorflow/core/data/standalone.h +++ b/tensorflow/core/data/standalone.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function_handle_cache.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/model.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/threadpool.h" @@ -89,12 +90,9 @@ class Iterator { // Restores the iterator from a checkpoint. `saved_iterator` is the serialized // iterator saved by calling `Save()`. Status Restore(const std::vector& saved_iterator); - // Returns the time it takes the pipeline associated with this iterator - // to process an element. - // Returns std::nullopt if there is not currently enough information to - // determine the processing time, e.g. because not enough data has been - // produced yet from the iterator. - std::optional GetProcessingTimeNsec() const; + + // Returns the dataset model for performance analysis. + std::shared_ptr model() const; private: friend class Dataset; diff --git a/tensorflow/core/data/standalone_test.cc b/tensorflow/core/data/standalone_test.cc index 964ec803a32df0..54f438b1cc2308 100644 --- a/tensorflow/core/data/standalone_test.cc +++ b/tensorflow/core/data/standalone_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tsl/lib/core/status_test_util.h" namespace tensorflow { namespace data { @@ -523,21 +524,15 @@ TEST(Scalar, Standalone) { GraphDef graph_def; protobuf::TextFormat::ParseFromString(test_case.graph_string, &graph_def); std::unique_ptr dataset; - auto s = Dataset::FromGraph({}, graph_def, &dataset); - TF_EXPECT_OK(s); + TF_EXPECT_OK(Dataset::FromGraph({}, graph_def, &dataset)); std::unique_ptr iterator; - s = dataset->MakeIterator(&iterator); - TF_EXPECT_OK(s); - - std::optional processing_time_nsec = - iterator->GetProcessingTimeNsec(); - EXPECT_EQ(processing_time_nsec, std::nullopt); + TF_EXPECT_OK(dataset->MakeIterator(&iterator)); + EXPECT_DOUBLE_EQ(iterator->model()->ComputeSnapshotProcessingTimeNsec(), 0); bool end_of_input = false; for (int num_outputs = 0; !end_of_input; ++num_outputs) { std::vector outputs; - s = iterator->GetNext(&outputs, &end_of_input); - TF_EXPECT_OK(s); + TF_EXPECT_OK(iterator->GetNext(&outputs, &end_of_input)); if (!end_of_input) { EXPECT_EQ(outputs[0].scalar()(), test_case.expected_outputs[num_outputs]); @@ -548,9 +543,7 @@ TEST(Scalar, Standalone) { // Wait for an optimization round in the pipeline model. absl::SleepFor(absl::Seconds(1)); - processing_time_nsec = iterator->GetProcessingTimeNsec(); - EXPECT_NE(processing_time_nsec, std::nullopt); - EXPECT_LT(0, processing_time_nsec.value()); + EXPECT_GT(iterator->model()->ComputeSnapshotProcessingTimeNsec(), 0); } } @@ -562,10 +555,7 @@ TEST(NoAutotune, Standalone) { TF_EXPECT_OK(Dataset::FromGraph({}, graph_def, &dataset)); std::unique_ptr iterator; TF_EXPECT_OK(dataset->MakeIterator(&iterator)); - - std::optional processing_time_nsec = - iterator->GetProcessingTimeNsec(); - EXPECT_EQ(processing_time_nsec, std::nullopt); + EXPECT_EQ(iterator->model(), nullptr); bool end_of_input = false; for (int num_outputs = 0; !end_of_input; ++num_outputs) { @@ -580,10 +570,8 @@ TEST(NoAutotune, Standalone) { // Wait for an optimization round in the pipeline model. absl::SleepFor(absl::Seconds(1)); - processing_time_nsec = iterator->GetProcessingTimeNsec(); - // Model should not be created and `GetProcessingTimeNsec()` should return - // `nullopt`. - EXPECT_EQ(processing_time_nsec, std::nullopt); + // Model should not be created. + EXPECT_EQ(iterator->model(), nullptr); } } // namespace diff --git a/tensorflow/core/data/utils.cc b/tensorflow/core/data/utils.cc index 7d346dcbecd319..73f8a75587e97e 100644 --- a/tensorflow/core/data/utils.cc +++ b/tensorflow/core/data/utils.cc @@ -14,11 +14,14 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/data/utils.h" +#include #include #include #include "absl/container/flat_hash_map.h" #include "absl/status/statusor.h" +#include "tensorflow/core/data/file_logger_client_interface.h" +#include "tensorflow/core/data/file_logger_client_no_op.h" #include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/protobuf/data_service.pb.h" @@ -44,5 +47,9 @@ absl::StatusOr DisableCompressionAtRuntime( return false; } +std::unique_ptr CreateFileLoggerClient() { + return std::make_unique(); +} + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/utils.h b/tensorflow/core/data/utils.h index d80431c9680ab8..00fe795f9c7f3e 100644 --- a/tensorflow/core/data/utils.h +++ b/tensorflow/core/data/utils.h @@ -15,11 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_CORE_DATA_UTILS_H_ #define TENSORFLOW_CORE_DATA_UTILS_H_ +#include #include #include #include "absl/container/flat_hash_map.h" #include "absl/status/statusor.h" +#include "tensorflow/core/data/file_logger_client_interface.h" #include "tensorflow/core/protobuf/data_service.pb.h" namespace tensorflow { @@ -48,6 +50,9 @@ std::string LocalityOptimizedPath(const std::string& path); absl::StatusOr DisableCompressionAtRuntime( const std::string& data_transfer_protocol, DeploymentMode deployment_mode); +// Creates a instance of a class derived from FileLoggerClientInterface. +std::unique_ptr CreateFileLoggerClient(); + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/utils_test.cc b/tensorflow/core/data/utils_test.cc new file mode 100644 index 00000000000000..1f908acb278b59 --- /dev/null +++ b/tensorflow/core/data/utils_test.cc @@ -0,0 +1,66 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/core/data/utils.h" + +#include + +#include +#include "tensorflow/core/data/file_logger_client_interface.h" +#include "tensorflow/core/data/file_logger_client_no_op.h" + +namespace tensorflow::data { +namespace { + +TEST(Util, CreateFileLoggerClient) { + std::unique_ptr client = CreateFileLoggerClient(); + EXPECT_NE(dynamic_cast(client.get()), nullptr); +} + +TEST(Util, DefaultDataTransferProtocol) { + EXPECT_EQ(DefaultDataTransferProtocol(), "grpc"); +} + +TEST(TranslateFileName, NoOp) { + constexpr char file[] = "/home/tfdata/file1"; + EXPECT_EQ(TranslateFileName(file), file); +} + +TEST(TranslateFileName, EmptyPath) { + constexpr char file[] = ""; + EXPECT_EQ(TranslateFileName(file), file); +} + +TEST(TranslateFileName, TfDataPath) { + constexpr char file[] = "tfdata/file1"; + EXPECT_EQ(TranslateFileName(file), file); +} + +TEST(LocalityOptimizedPath, NoOp) { + constexpr char file[] = "/home/tfdata/file1"; + EXPECT_EQ(LocalityOptimizedPath(file), file); +} + +TEST(LocalityOptimizedPath, EmptyPath) { + constexpr char file[] = ""; + EXPECT_EQ(LocalityOptimizedPath(file), file); +} + +TEST(LocalityOptimizedPath, TfDataPath) { + constexpr char file[] = "tfdata/file1"; + EXPECT_EQ(LocalityOptimizedPath(file), file); +} + +} // namespace +} // namespace tensorflow::data diff --git a/tensorflow/core/distributed_runtime/README.md b/tensorflow/core/distributed_runtime/README.md index d22cd2a45bc68e..b4220beeae5f5f 100644 --- a/tensorflow/core/distributed_runtime/README.md +++ b/tensorflow/core/distributed_runtime/README.md @@ -4,5 +4,7 @@ This directory contains the initial open-source implementation of the distributed TensorFlow runtime, using [gRPC](http://grpc.io) for inter-process communication. -To learn how to use the distributed runtime to create a TensorFlow cluster, -see the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) How-To. +To learn how to use the distributed runtime to create a TensorFlow cluster, see +the +[Distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) +How-To. diff --git a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc index c4a7af7c6a26fd..0261268a589e2c 100644 --- a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc +++ b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
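The utils tests above pin down the behavior of the new factory: in this build it hands back the no-op implementation. A small sketch of how a caller might obtain the logger through the interface only; the wrapper name is hypothetical, and no interface methods are invoked because they are not part of this diff.

```cpp
#include <memory>

#include "tensorflow/core/data/file_logger_client_interface.h"
#include "tensorflow/core/data/utils.h"

namespace tensorflow {
namespace data {

// Hypothetical caller: depends only on FileLoggerClientInterface. In the
// open-source build the factory returns FileLoggerClientNoOp, so any calls
// made through the interface are cheap no-ops.
std::unique_ptr<FileLoggerClientInterface> MakeIteratorFileLogger() {
  return CreateFileLoggerClient();
}

}  // namespace data
}  // namespace tensorflow
```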
#include #include #include +#include #include #include @@ -78,36 +79,36 @@ class MockCoordinationServiceAgent : public CoordinationServiceAgent { MOCK_METHOD(Status, ReportError, (const Status& error), (override)); MOCK_METHOD(Status, Shutdown, (), (override)); MOCK_METHOD(Status, Reset, (), (override)); - MOCK_METHOD(StatusOr, GetKeyValue, (const std::string& key), + MOCK_METHOD(StatusOr, GetKeyValue, (std::string_view key), (override)); MOCK_METHOD(StatusOr, GetKeyValue, (const char* key, int64_t key_size), (override)); MOCK_METHOD(StatusOr, GetKeyValue, - (const std::string& key, absl::Duration timeout), (override)); + (std::string_view key, absl::Duration timeout), (override)); MOCK_METHOD(std::shared_ptr, GetKeyValueAsync, - (const std::string& key, StatusOrValueCallback done), (override)); - MOCK_METHOD(StatusOr, TryGetKeyValue, (const std::string& key), + (std::string_view key, StatusOrValueCallback done), (override)); + MOCK_METHOD(StatusOr, TryGetKeyValue, (std::string_view key), (override)); MOCK_METHOD(StatusOr>, GetKeyValueDir, - (const std::string& key), (override)); + (std::string_view key), (override)); MOCK_METHOD(void, GetKeyValueDirAsync, - (const std::string& key, StatusOrValueDirCallback done), + (std::string_view key, StatusOrValueDirCallback done), (override)); MOCK_METHOD(Status, InsertKeyValue, - (const std::string& key, const std::string& value), (override)); + (std::string_view key, std::string_view value), (override)); MOCK_METHOD(Status, InsertKeyValue, (const char* key, int64_t key_size, const char* value, int64_t value_size), (override)); - MOCK_METHOD(Status, DeleteKeyValue, (const std::string& key), (override)); + MOCK_METHOD(Status, DeleteKeyValue, (std::string_view key), (override)); MOCK_METHOD(Status, DeleteKeyValue, (const char* key, int64_t key_size), (override)); MOCK_METHOD(Status, UpdateKeyValue, - (const std::string& key, const std::string& value), (override)); + (std::string_view key, std::string_view value), (override)); MOCK_METHOD(Status, StartWatchKey, - (const std::string& key, ChangedKeyValuesCallback on_change), + (std::string_view key, ChangedKeyValuesCallback on_change), (override)); - MOCK_METHOD(Status, StopWatchKey, (const std::string& key), (override)); + MOCK_METHOD(Status, StopWatchKey, (std::string_view key), (override)); MOCK_METHOD(void, WaitAtBarrierAsync, (const std::string& barrier_id, absl::Duration timeout, const std::vector& tasks, StatusCallback done), @@ -117,7 +118,7 @@ class MockCoordinationServiceAgent : public CoordinationServiceAgent { MOCK_METHOD(StatusOr, GetEnv, (), (override)); MOCK_METHOD(void, SetError, (const Status& error), (override)); MOCK_METHOD(Status, ActivateWatch, - (const std::string& key, + (std::string_view key, (const std::map&)), (override)); }; diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD index 46e86a42be6734..a3a1c8ae937db7 100644 --- a/tensorflow/core/distributed_runtime/eager/BUILD +++ b/tensorflow/core/distributed_runtime/eager/BUILD @@ -1,9 +1,9 @@ -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_grpc_cc_dependencies") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", ) +load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_grpc_cc_dependencies") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -102,6 +102,7 @@ cc_library( 
":remote_tensor_handle", "//tensorflow/c/eager:immediate_execution_distributed_manager", "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index f2fd43ca853156..f6f3bf1ee1668c 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/session_mgr.h" #include "tensorflow/core/distributed_runtime/worker_cache.h" #include "tensorflow/core/distributed_runtime/worker_env.h" +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/nccl/collective_communicator.h" #include "tensorflow/core/platform/errors.h" @@ -48,6 +49,7 @@ limitations under the License. #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/stringprintf.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tsl/distributed_runtime/preemption/preemption_notifier.h" #include "tsl/protobuf/coordination_config.pb.h" @@ -55,13 +57,14 @@ namespace tensorflow { namespace eager { namespace { -Status GetNumRetvals(tensorflow::EagerContext* context, const string& op_name, +Status GetNumRetvals(FunctionLibraryDefinition* func_lib_def, + const string& op_name, const google::protobuf::Map& attrs, int* num_retvals) { const tensorflow::OpRegistrationData* op_reg_data = nullptr; auto status = tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data); if (absl::IsNotFound(status)) { - status = context->FindFunctionOpData(op_name, &op_reg_data); + status = func_lib_def->LookUp(op_name, &op_reg_data); } TF_RETURN_IF_ERROR(status); @@ -100,14 +103,27 @@ Status GetEagerOperationAndNumRetvals(const Operation& operation, const char* name = operation.name().c_str(); // Shorthand std::optional remote_func_params = std::nullopt; + FunctionLibraryDefinition* func_lib_def; if (operation.is_function()) { if (operation.is_component_function()) { + func_lib_def = + eager_context->GetComponentFunctionFunctionLibraryDefinition( + operation.name()); + if (func_lib_def == nullptr) { + return absl::InternalError( + absl::StrCat("Could not find function library for registered " + "component function: ", + operation.name())); + } remote_func_params = {operation.id(), /*is_component_function=*/true, - operation.func_step_id()}; + operation.func_step_id(), func_lib_def}; } else { + func_lib_def = eager_context->FuncLibDef(); remote_func_params = {operation.id(), /*is_component_function=*/false, - std::nullopt}; + std::nullopt, /*func_lib_def=*/nullptr}; } + } else { + func_lib_def = eager_context->FuncLibDef(); } TF_RETURN_IF_ERROR(eager_op->Reset(name, operation.device().c_str(), false, eager_executor, remote_func_params)); @@ -143,7 +159,7 @@ Status GetEagerOperationAndNumRetvals(const Operation& operation, } // TODO(nareshmodi): Consider caching this. 
- return GetNumRetvals(eager_context, operation.name(), operation.attrs(), + return GetNumRetvals(func_lib_def, operation.name(), operation.attrs(), num_retvals); } @@ -770,9 +786,14 @@ Status EagerServiceImpl::RegisterFunction( const RegisterFunctionOp& register_function, EagerContext* eager_context) { // If the function is a component of a multi-device function, we only need to // register it locally. - return eager_context->AddFunctionDef( - register_function.function_def(), register_function.library(), - register_function.is_component_function()); + if (register_function.is_component_function()) { + return eager_context->AddComponentFunction(register_function.function_def(), + register_function.library()); + } else { + return eager_context->AddFunctionDef(register_function.function_def(), + register_function.library(), + /*add_to_local_only=*/false); + } } Status EagerServiceImpl::RemoveFunction(const RemoveFunctionOp& remove_function, diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 79f8ccb21d934a..2ab6631de71d9b 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -309,6 +309,46 @@ tensorflow::FunctionDef MatMulFunction() { return def; } +tensorflow::FunctionDef MatMulTransposeFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'MatMulFunction'" + " input_arg {" + " name: 'a'" + " type: DT_FLOAT" + " }" + " output_arg {" + " name: 'm'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'matmul'" + " op: 'MatMul'" + " input: 'a'" + " input: 'a'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " attr {" + " key: 'transpose_a'" + " value {" + " b: true" + " }" + " }" + " }" + " ret {" + " key: 'm'" + " value: 'matmul:product'" + " }", + &def)); + return def; +} + tensorflow::FunctionDef MatMulNestedFunction() { tensorflow::FunctionDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( @@ -710,15 +750,178 @@ TEST_F(EagerServiceImplFunctionTest, FunctionCancellationTest) { TEST_F(EagerServiceImplFunctionTest, ComponentFunctionTest) { RegisterFunctionOp register_op; *register_op.mutable_function_def() = MatMulFunction(); + register_op.set_is_component_function(true); TestComponentFunction(register_op, "MatMulFunction", false); } TEST_F(EagerServiceImplFunctionTest, ComponentFunctionCancellationTest) { RegisterFunctionOp register_op; *register_op.mutable_function_def() = SingleRecvNodeFunction(); + register_op.set_is_component_function(true); TestComponentFunction(register_op, "SingleRecvNodeFunction", true); } +TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionTest) { + RegisterFunctionOp register_op; + *register_op.mutable_function_def() = MatMulNestedFunction(); + *register_op.mutable_library()->add_function() = MatMulFunction(); + register_op.set_is_component_function(true); + TestComponentFunction(register_op, "MatMulNestedFunction", false); +} + +TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) { + TestEagerServiceImpl eager_service_impl(&worker_env_); + uint64 context_id = random::New64(); + + // Create context. 
+ CreateContextRequest request; + request.mutable_server_def()->set_job_name("localhost"); + request.mutable_server_def()->set_task_index(0); + request.set_context_id(context_id); + CreateContextResponse response; + TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response)); + + // Register first function. + { + EnqueueRequest enqueue_request; + enqueue_request.set_context_id(context_id); + RegisterFunctionOp* register_op = + enqueue_request.add_queue()->mutable_register_function(); + *register_op->mutable_function_def() = MatMulNestedFunction(); + *register_op->mutable_library()->add_function() = MatMulFunction(); + register_op->set_is_component_function(true); + EnqueueResponse enqueue_response; + TF_ASSERT_OK(eager_service_impl.Enqueue(nullptr, &enqueue_request, + &enqueue_response)); + } + + // Register second function. + // In the second registration, the library contains a function named + // "MatMulFunction" but a different body. + { + EnqueueRequest enqueue_request; + enqueue_request.set_context_id(context_id); + RegisterFunctionOp* register_op = + enqueue_request.add_queue()->mutable_register_function(); + + *register_op->mutable_function_def() = MatMulNestedFunction(); + register_op->mutable_function_def()->mutable_signature()->set_name( + "MatMulNestedTransposeFunction"); + *register_op->mutable_library()->add_function() = MatMulTransposeFunction(); + register_op->set_is_component_function(true); + EnqueueResponse enqueue_response; + TF_ASSERT_OK(eager_service_impl.Enqueue(nullptr, &enqueue_request, + &enqueue_response)); + } + + // First run an op to generate input for the functions. + EnqueueRequest remote_enqueue_request; + remote_enqueue_request.set_context_id(context_id); + EnqueueResponse remote_enqueue_response; + + std::unordered_map const_attrs; + AttrValue val; + val.set_type(tensorflow::DataType::DT_FLOAT); + const_attrs.insert({"dtype", val}); + val.Clear(); + SetTensorProto(val.mutable_tensor()); + const_attrs.insert({"value", val}); + AddOperationToEnqueueRequest(1, "Const", {}, const_attrs, + "/job:localhost/replica:0/task:0/device:CPU:0", + &remote_enqueue_request); + TF_ASSERT_OK(eager_service_impl.Enqueue(nullptr, &remote_enqueue_request, + &remote_enqueue_response)); + + { + // Run first function with input from the previous op. + RunComponentFunctionRequest run_comp_func_request; + run_comp_func_request.set_context_id(context_id); + RunComponentFunctionResponse run_comp_func_response; + const int output_num = 5; + AddOperationToRunComponentFunctionRequest( + 2, "MatMulNestedFunction", {std::make_pair(1, 0)}, + std::unordered_map(), + "/job:localhost/replica:0/task:0/device:CPU:0", output_num, + &run_comp_func_request); + + CallOptions call_opts; + Notification n; + Status status; + eager_service_impl.RunComponentFunction(&call_opts, &run_comp_func_request, + &run_comp_func_response, + [&status, &n](const Status& s) { + status.Update(s); + n.Notify(); + }); + n.WaitForNotification(); + + TF_ASSERT_OK(status); + // Retrieve the output. + const tensorflow::Tensor* t = nullptr; + tensorflow::TensorHandle* tensor_handle; + TF_ASSERT_OK(eager_service_impl.GetTensorHandle( + context_id, RemoteTensorHandleInternal(2, output_num), &tensor_handle)); + TF_ASSERT_OK(tensor_handle->Tensor(&t)); + + auto actual = t->flat(); + EXPECT_EQ(4, actual.size()); + + EXPECT_EQ(7, actual(0)); + EXPECT_EQ(10, actual(1)); + EXPECT_EQ(15, actual(2)); + EXPECT_EQ(22, actual(3)); + } + + { + // Run second function with input from the constant op. 
The result should + // be different, because we are using the transposed implementation of + // MatMulFunction in the second function's library. + RunComponentFunctionRequest run_comp_func_request; + run_comp_func_request.set_context_id(context_id); + RunComponentFunctionResponse run_comp_func_response; + const int output_num = 5; + AddOperationToRunComponentFunctionRequest( + 3, "MatMulNestedTransposeFunction", {std::make_pair(1, 0)}, + std::unordered_map(), + "/job:localhost/replica:0/task:0/device:CPU:0", output_num, + &run_comp_func_request); + + CallOptions call_opts; + Notification n; + Status status; + eager_service_impl.RunComponentFunction(&call_opts, &run_comp_func_request, + &run_comp_func_response, + [&status, &n](const Status& s) { + status.Update(s); + n.Notify(); + }); + n.WaitForNotification(); + + TF_ASSERT_OK(status); + // Retrieve the output. + const tensorflow::Tensor* t = nullptr; + tensorflow::TensorHandle* tensor_handle; + TF_ASSERT_OK(eager_service_impl.GetTensorHandle( + context_id, RemoteTensorHandleInternal(3, output_num), &tensor_handle)); + TF_ASSERT_OK(tensor_handle->Tensor(&t)); + + auto actual = t->flat(); + EXPECT_EQ(4, actual.size()); + + EXPECT_EQ(10, actual(0)); + EXPECT_EQ(14, actual(1)); + EXPECT_EQ(14, actual(2)); + EXPECT_EQ(20, actual(3)); + } + + CloseContextRequest close_context_request; + close_context_request.set_context_id(context_id); + close_context_request.set_context_view_id(0); + CloseContextResponse close_context_response; + TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request, + &close_context_response)); +} + class FunctionWithRemoteInputsTest : public EagerServiceImplTest { public: FunctionWithRemoteInputsTest() @@ -987,7 +1190,7 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) { // Instantiate MatMulFunction on remote_device. const NodeDef node_def = MatMulFunctionNodeDef(); - TF_ASSERT_OK(kernel->InstantiateFunc({}, node_def, nullptr)); + TF_ASSERT_OK(kernel->InstantiateFunc({}, node_def, nullptr, std::nullopt)); // Run MatMulFunction on remote_device. gtl::InlinedVector input_tensors = {TensorValue()}; @@ -1042,7 +1245,7 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncAsyncTest) { // Instantiate MatMulFunction on remote_device. const NodeDef node_def = MatMulFunctionNodeDef(); - TF_ASSERT_OK(kernel->InstantiateFunc({}, node_def, nullptr)); + TF_ASSERT_OK(kernel->InstantiateFunc({}, node_def, nullptr, std::nullopt)); // Run MatMulFunction on remote_device. gtl::InlinedVector input_tensors = {TensorValue()}; diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index 4cefc9433c2556..bd5bc39622b9d6 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -60,7 +60,8 @@ Status CreateUncachedKernelAndDeviceOp( const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef(); return kernel->get()->Init(ctx.LogDevicePlacement(), ndef, - /*graph_collector=*/nullptr); + /*graph_collector=*/nullptr, + /*eager_func_params=*/std::nullopt); } // This gets a unique wire ID. 
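A quick worked check of the values asserted in the test above, assuming (consistently with those assertions) that the Const op feeds the 2x2 matrix A to both component functions; the first run uses MatMul(a, a), the second uses the transposed variant from the clashing library:

```latex
A = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}, \qquad
AA = \begin{pmatrix} 7 & 10 \\ 15 & 22 \end{pmatrix}, \qquad
A^{\top}A = \begin{pmatrix} 1 & 3 \\ 2 & 4 \end{pmatrix}
            \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}
          = \begin{pmatrix} 10 & 14 \\ 14 & 20 \end{pmatrix}.
```

So the non-transposed run yields 7, 10, 15, 22 and the transposed run yields 10, 14, 14, 20, which are exactly the EXPECT_EQ values in the two blocks.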
We add a random identifier so that if the diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h index 6aabf3ce209d7d..148e58a5b008c5 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h @@ -42,7 +42,8 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode { std::unique_ptr request, Device* device, uint64 context_view_id, EagerClient* eager_client, CancellationManager* cancellation_manager, - const NodeDef& ndef, FunctionLibraryDefinition* lib_def, + const NodeDef& ndef, + const FunctionLibraryDefinition* lib_def, const gtl::InlinedVector& inputs, absl::Span retvals) : AsyncRemoteExecuteNode(), diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD index 27c92c157eb7ae..01a6cfa158390a 100644 --- a/tensorflow/core/framework/BUILD +++ b/tensorflow/core/framework/BUILD @@ -712,6 +712,7 @@ cc_library( hdrs = ["resource_base.h"], visibility = default_visibility + [ "//learning/brain/google/data/core/kernels:__pkg__", + "//learning/deepmind/tensorflow/queues:__pkg__", "//learning/deepmind/tensorflow/sstable:__pkg__", ], deps = [ diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index a174234a86e483..5fc2119aaace5f 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -400,6 +400,12 @@ class SplitProvider { // Restores the state of this split provider. virtual Status Restore(std::function full_name, IteratorStateReader* reader) = 0; + // Returns the number of splits: + // - If there are a finite number of splits, returns a non-negative count. + // - If there are an infinite number of splits, returns kInfiniteCardinality. + // - If the number of splits is unknown or can't be efficiently computed, + // returns kUnknownCardinality. + virtual int64_t Cardinality() const { return kUnknownCardinality; } }; // Returns the runner threadpool size from an OpKernelContext. diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc index d806545d573bda..080e8b5c98f719 100644 --- a/tensorflow/core/framework/function_testlib.cc +++ b/tensorflow/core/framework/function_testlib.cc @@ -137,6 +137,25 @@ FunctionDef XTimesTwo() { }); } +FunctionDef XTimesTwoWithControlInput() { + const Tensor kTwo = test::AsScalar(2); + return FDH::Define( + // Name + "XTimesTwo", + // Args + {"x: T"}, + // Return values + {"y: T"}, + // Attr def + {"T: {float, double, int32, int64}"}, + // Nodes + { + {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}}, + {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}}, + {{"y"}, "Mul", {"scale"}, {{"T", "$T"}}, /*dep=*/{"x"}}, + }); +} + FunctionDef TwoDeviceMult() { const Tensor kTwo = test::AsScalar(2); const Tensor kThree = test::AsScalar(3); diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h index 559e0d6d67d241..b0ce4fafde58af 100644 --- a/tensorflow/core/framework/function_testlib.h +++ b/tensorflow/core/framework/function_testlib.h @@ -70,6 +70,8 @@ GraphDef GDef(gtl::ArraySlice nodes, // x: T -> x * 2. FunctionDef XTimesTwo(); +// Same as `XTimesTwo` above, but with the `x` input as a control dependency. +FunctionDef XTimesTwoWithControlInput(); // x: T -> cpu(x * 2) + cpu(x * 3). 
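Since the new `SplitProvider::Cardinality()` hook above is documented but not exercised in this diff, here is a small illustrative consumer; `LogSplitCount` is a hypothetical name, while `SplitProvider`, `kInfiniteCardinality`, and `kUnknownCardinality` are the type and sentinels the new comment refers to.

```cpp
#include <cstdint>

#include "tensorflow/core/framework/dataset.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace data {

// Illustrative consumer of the Cardinality() contract described above.
void LogSplitCount(const SplitProvider& provider) {
  const int64_t n = provider.Cardinality();
  if (n == kInfiniteCardinality) {
    LOG(INFO) << "Split provider yields an unbounded number of splits.";
  } else if (n == kUnknownCardinality) {
    LOG(INFO) << "Split count unknown (the base-class default).";
  } else {
    LOG(INFO) << "Split provider yields " << n << " splits.";
  }
}

}  // namespace data
}  // namespace tensorflow
```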
FunctionDef TwoDeviceTimesFive(); diff --git a/tensorflow/core/framework/metrics.cc b/tensorflow/core/framework/metrics.cc index 104af6dcfb1936..e6f94a8e444b4a 100644 --- a/tensorflow/core/framework/metrics.cc +++ b/tensorflow/core/framework/metrics.cc @@ -266,6 +266,12 @@ auto* tf_data_model_gauge = tsl::monitoring::Gauge, 1>::New( "/tensorflow/data/model", "tf.data autotuning model proto.", "id"); +auto* tf_data_pipeline_processing_time = tsl::monitoring::Gauge::New( + "/tensorflow/data/pipeline_processing_time", + "The total processing time of the slowest stage in the input pipeline " + "in microseconds", + "id"); + auto* tf_data_auto_shard = tsl::monitoring::Gauge::New( "/tensorflow/data/autoshard", "tf.data autoshard statistics.", "id", "name"); @@ -467,6 +473,11 @@ tsl::monitoring::GaugeCell>* GetTFDataModelGauge( return tf_data_model_gauge->GetCell(id); } +tsl::monitoring::GaugeCell* GetTFDataPipelineProcessingTimeGauge( + const string& id) { + return tf_data_pipeline_processing_time->GetCell(id); +} + void RecordTFDataBytesFetched(int64_t num_bytes) { tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes); } @@ -822,6 +833,11 @@ void RecordUnusedOutput(const string& op_name) { graph_unused_outputs->GetCell(op_name)->IncrementBy(1); } +void RecordPipelineProcessingTime(const string& id, + double pipeline_processing_time_usec) { + GetTFDataPipelineProcessingTimeGauge(id)->Set(pipeline_processing_time_usec); +} + void IncrementTestCounter(const string& name, const string& label) { test_counters->GetCell(name, label)->IncrementBy(1); } diff --git a/tensorflow/core/framework/metrics.h b/tensorflow/core/framework/metrics.h index 5b15ee16b0a165..bcc5808cf7f8e8 100644 --- a/tensorflow/core/framework/metrics.h +++ b/tensorflow/core/framework/metrics.h @@ -243,6 +243,10 @@ void UpdateGraphPendingQueueLength(uint64 len); // Records that one output of an op of type `op_name` was unused. void RecordUnusedOutput(const string& op_name); +// Records the pipeline processing time in microseconds +void RecordPipelineProcessingTime(const string& id, + double pipeline_processing_time_usec); + // Updates the metrics stored about time spent building graphs. // // By "GraphBuild", we refer to building a client graph, which is a sub-graph of diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index 7cfbd96294136d..17bf8d731236b8 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -18,12 +18,14 @@ limitations under the License. 
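A minimal sketch of how the new metric is populated; `RecordPipelineProcessingTime` and the metric path come from the hunk above, while the wrapper name and id are illustrative. As the model changes later in this diff show, the id is the Model's address rendered as a string, and the gauge is reset to 0 when the model is destroyed.

```cpp
#include <string>

#include "tensorflow/core/framework/metrics.h"

namespace tensorflow {

// Illustrative wrapper: publishes the slowest-stage processing time for one
// input pipeline to /tensorflow/data/pipeline_processing_time, keyed by the
// pipeline/model id.
void PublishSlowestStageTime(const std::string& model_id,
                             double slowest_stage_usec) {
  metrics::RecordPipelineProcessingTime(model_id, slowest_stage_usec);
}

}  // namespace tensorflow
```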
#include #include #include +#include #include #include #include #include "absl/time/clock.h" #include "tensorflow/core/framework/cancellation.h" +#include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/framework/model.pb.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -549,7 +551,7 @@ class InterleaveMany : public Node { self_processing_time + inputs_processing_time; } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::INTERLEAVE_MANY); return OkStatus(); @@ -761,7 +763,7 @@ class AsyncInterleaveMany : public Node { self_processing_time + inputs_processing_time; } - double MaximumBufferedBytes() const TF_SHARED_LOCKS_REQUIRED(mu_) { + double MaximumBufferedBytes() const override TF_SHARED_LOCKS_REQUIRED(mu_) { auto* parameter = gtl::FindOrNull(parameters_, kMaxBufferedElements); if (parameter == nullptr) { parameter = gtl::FindOrNull(parameters_, kParallelism); @@ -772,7 +774,7 @@ class AsyncInterleaveMany : public Node { return (*parameter)->value * AverageBufferedElementSizeLocked(); } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::ASYNC_INTERLEAVE_MANY); return OkStatus(); @@ -864,7 +866,7 @@ class KnownRatio : public Node { self_processing_time + inputs_processing_time; } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::KNOWN_RATIO); node_proto->set_ratio(ratio_); @@ -1243,7 +1245,7 @@ class UnknownRatio : public Node { self_processing_time + inputs_processing_time; } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::UNKNOWN_RATIO); return OkStatus(); @@ -1297,7 +1299,7 @@ class Unknown : public Node { TotalProcessingTimeForInputs(*total_processing_times); } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::UNKNOWN); return OkStatus(); @@ -1326,7 +1328,7 @@ class AsyncKnownRatio : public AsyncRatio { parameters); } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::ASYNC_KNOWN_RATIO); node_proto->set_ratio(Ratio()); @@ -1371,7 +1373,7 @@ class AsyncUnknownRatio : public AsyncRatio { Args{id_, name_, std::move(output)}, parameters); } - Status ToProto(ModelProto::Node* node_proto) const { + Status ToProto(ModelProto::Node* node_proto) const override { TF_RETURN_IF_ERROR(Node::ToProto(node_proto)); node_proto->set_node_class(NodeClass::ASYNC_UNKNOWN_RATIO); return OkStatus(); @@ -2205,8 +2207,9 @@ Status Node::FromProto(ModelProto::Node node_proto, Model::Model() : optimization_period_ms_(kOptimizationPeriodMinMs), safe_to_collect_metrics_(std::make_shared(true)) { - model_gauge_cell_ = metrics::GetTFDataModelGauge( - strings::StrCat(reinterpret_cast(this))); + model_id_ = 
strings::StrCat(reinterpret_cast(this)); + model_gauge_cell_ = metrics::GetTFDataModelGauge(model_id_); + // Capture `safe_to_collect_metrics_` by value to avoid use-after-free issues // when the callback is invoked after the model has been destroyed. model_gauge_cell_->Set( @@ -2237,6 +2240,8 @@ Model::Model() Model::~Model() { mutex_lock l(safe_to_collect_metrics_->mu); safe_to_collect_metrics_->val = false; + // Reset the pipeline processing time to 0 + metrics::RecordPipelineProcessingTime(model_id_, 0); } void Model::AddNode(Node::Factory factory, const string& name, @@ -2356,6 +2361,38 @@ void Model::Optimize(AutotuneAlgorithm algorithm, mutex_lock l(mu_); snapshot_ = snapshot; optimization_params_ = optimization_params; + + if (snapshot_) { + double pipeline_processing_usec = 0; + ModelTiming model_timing(snapshot_); + auto bfs_stage_roots = model_timing.GetStageRoots(); + for (const auto& root : bfs_stage_roots) { + auto* root_timing = model_timing.GetTiming(root.get()); + if (root_timing == nullptr) { + constexpr int TEN_MINUTES = 60 * 10; + LOG_EVERY_N_SEC(ERROR, TEN_MINUTES) + << "Encounter an error when computing the pipeline processing " + "time for " + "/tensorflow/data/pipeline_processing_time"; + pipeline_processing_usec = 0; + break; + } + + double root_total_time_usec = root_timing->total_time_nsec * + root_timing->pipeline_ratio / + EnvTime::kMicrosToNanos; + + pipeline_processing_usec = + std::max(pipeline_processing_usec, root_total_time_usec); + } + // Only updates the pipeline processing time when it is greater than 0. + // If it is zero, we assume the pipeline processing time is the same + // as the previous one and do not update it. + if (pipeline_processing_usec > 0) { + metrics::RecordPipelineProcessingTime(model_id_, + pipeline_processing_usec); + } + } } } diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index a7b95dfe5cadd0..505fc2f8a5e3f2 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -1186,6 +1186,8 @@ class Model { std::shared_ptr snapshot_ TF_GUARDED_BY(mu_); // Stores the optimization parameters used by autotune. OptimizationParams optimization_params_ TF_GUARDED_BY(mu_); + // Stores the model id in the string format + std::string model_id_; }; // Class to compute timing information for a model. diff --git a/tensorflow/core/framework/op_requires.h b/tensorflow/core/framework/op_requires.h index c5fb7796ecf6e8..a009a11ea606e7 100644 --- a/tensorflow/core/framework/op_requires.h +++ b/tensorflow/core/framework/op_requires.h @@ -49,7 +49,7 @@ namespace tensorflow { #define OP_REQUIRES_OK(CTX, ...) \ do { \ - ::tensorflow::Status _s(__VA_ARGS__); \ + const ::tensorflow::Status& _s(__VA_ARGS__); \ if (!TF_PREDICT_TRUE(_s.ok())) { \ CheckNotInComputeAsync((CTX), "OP_REQUIRES_OK_ASYNC"); \ (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ diff --git a/tensorflow/core/framework/tensor_matcher.h b/tensorflow/core/framework/tensor_matcher.h index 094d66f81f72f3..e89cfc15cd1f2a 100644 --- a/tensorflow/core/framework/tensor_matcher.h +++ b/tensorflow/core/framework/tensor_matcher.h @@ -34,7 +34,7 @@ namespace test { // // Use this like: // -// EXPECT_EQ(lhs, TensorEq(rhs)); +// EXPECT_THAT(lhs, TensorEq(rhs)); // // All POD types and DT_STRING type tensors are supported. Note that this // utility requires Tensors to point to CPU memory. 
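In formula form, the value published by Model::Optimize() above on each round with a valid snapshot is the slowest stage root's total time, scaled by its pipeline ratio and converted from nanoseconds to microseconds (EnvTime::kMicrosToNanos = 1000):

```latex
\text{pipeline\_processing\_usec}
  \;=\; \max_{r \,\in\, \text{stage roots}}
        \frac{\text{total\_time\_nsec}(r)\,\cdot\,\text{pipeline\_ratio}(r)}{1000}
```

A value of zero (for example, when a stage root's timing cannot be computed) is treated as "no update", so the previously recorded gauge value is kept.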
diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h index c4e23a8d07ba5e..152e0538f81bfe 100644 --- a/tensorflow/core/framework/variant.h +++ b/tensorflow/core/framework/variant.h @@ -25,6 +25,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/framework/variant_tensor_data.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/strcat.h" diff --git a/tensorflow/core/function/capture/BUILD b/tensorflow/core/function/capture/BUILD index 9588fe68bdf6ee..31ecbe1a79c16f 100644 --- a/tensorflow/core/function/capture/BUILD +++ b/tensorflow/core/function/capture/BUILD @@ -34,9 +34,10 @@ py_strict_test( ], deps = [ ":free_vars_detect", - "//tensorflow/python/util:tf_decorator_py", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//third_party/py/numpy", + "//tensorflow/python/util:tf_decorator_py", ], ) @@ -45,13 +46,14 @@ py_strict_test( srcs = ["by_ref_capture_test.py"], python_version = "PY3", deps = [ + "@absl_py//absl/testing:parameterized", + #internal proto upb dep "//tensorflow/python/compat:v2_compat", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:combinations", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:ops", "//tensorflow/python/platform:client_testlib", - "@absl_py//absl/testing:parameterized", ], ) @@ -76,9 +78,10 @@ py_strict_test( python_version = "PY3", deps = [ ":capture_container", + "@absl_py//absl/testing:parameterized", + #internal proto upb dep "//tensorflow/core/function/trace_type", "//tensorflow/python/platform:client_testlib", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/core/function/integration_test/BUILD b/tensorflow/core/function/integration_test/BUILD index 5c6f6d189d9e25..e78adb5486088d 100644 --- a/tensorflow/core/function/integration_test/BUILD +++ b/tensorflow/core/function/integration_test/BUILD @@ -12,8 +12,9 @@ py_strict_test( srcs = ["side_inputs_test.py"], python_version = "PY3", deps = [ - "//tensorflow:tensorflow_py", "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//tensorflow:tensorflow_py", ], ) @@ -22,7 +23,8 @@ py_strict_test( srcs = ["side_inputs_manual_api_test.py"], python_version = "PY3", deps = [ - "//tensorflow:tensorflow_py", "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//tensorflow:tensorflow_py", ], ) diff --git a/tensorflow/core/function/polymorphism/BUILD b/tensorflow/core/function/polymorphism/BUILD index 478ca86c222332..67165dfc34deca 100644 --- a/tensorflow/core/function/polymorphism/BUILD +++ b/tensorflow/core/function/polymorphism/BUILD @@ -32,6 +32,7 @@ py_strict_test( deps = [ ":function_type", ":type_dispatch", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", "//tensorflow/python/types:trace", ], @@ -60,6 +61,7 @@ py_strict_test( visibility = ["//learning/brain/contrib/eager/python/examples:__pkg__"], deps = [ ":function_cache", + #internal proto upb dep "//tensorflow/core/function/polymorphism:function_type", "//tensorflow/core/function/trace_type", "//tensorflow/python/ops:array_ops", @@ -116,6 +118,7 @@ py_strict_test( python_version = "PY3", deps = [ ":function_type", + #internal proto upb dep "//tensorflow/core/function/polymorphism:function_type_proto_py", "//tensorflow/core/function/trace_type", 
"//tensorflow/core/function/trace_type:serialization", diff --git a/tensorflow/core/function/runtime_client/BUILD b/tensorflow/core/function/runtime_client/BUILD index 2ad85369234ca7..d1046cdd72e9cd 100644 --- a/tensorflow/core/function/runtime_client/BUILD +++ b/tensorflow/core/function/runtime_client/BUILD @@ -19,13 +19,16 @@ cc_library( hdrs = [ "runtime_client.h", ], + defines = select({ + "//tensorflow/compiler/mlir/python:disable_mlir_config": ["DISABLE_MLIR"], + "//conditions:default": [], + }), visibility = ["//tensorflow:__subpackages__"], deps = [ "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/c/eager:immediate_execution_tensor_handle", - "//tensorflow/compiler/mlir/python:mlir", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:export_graphdef", "//tensorflow/compiler/mlir/tensorflow:import_model", @@ -52,7 +55,12 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", - ], + ] + select({ + "//tensorflow/compiler/mlir/python:disable_mlir_config": [], + "//conditions:default": [ + "//tensorflow/compiler/mlir/python:mlir", + ], + }), # TODO(mdan): Get rid of alwayslink, it's nonstandard. alwayslink = 1, ) @@ -156,6 +164,7 @@ py_strict_test( tags = ["no_oss"], # TODO(b/219089812) deps = [ ":runtime_client_py", + #internal proto upb dep "//tensorflow/core/framework:function_proto_py", "//tensorflow/core/function/testing:test_pass_py", "//tensorflow/python:tf2", diff --git a/tensorflow/core/function/runtime_client/runtime_client.cc b/tensorflow/core/function/runtime_client/runtime_client.cc index b10bcc3856e19b..6438a1ca2b83c1 100644 --- a/tensorflow/core/function/runtime_client/runtime_client.cc +++ b/tensorflow/core/function/runtime_client/runtime_client.cc @@ -31,7 +31,11 @@ limitations under the License. 
#include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/c/eager/immediate_execution_operation.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" + +#if !defined(DISABLE_MLIR) #include "tensorflow/compiler/mlir/python/mlir.h" +#endif + #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" diff --git a/tensorflow/core/function/trace_type/BUILD b/tensorflow/core/function/trace_type/BUILD index a88f0a3aca20d6..b3c0f2e05f8fa6 100644 --- a/tensorflow/core/function/trace_type/BUILD +++ b/tensorflow/core/function/trace_type/BUILD @@ -51,6 +51,8 @@ py_strict_test( ":custom_nest_trace_type", ":default_types", ":trace_type", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:iterator_ops", @@ -66,7 +68,6 @@ py_strict_test( "//tensorflow/python/ops:variables", "//tensorflow/python/ops/ragged:ragged_tensor", "//tensorflow/python/platform:client_testlib", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) @@ -94,6 +95,7 @@ py_strict_test( deps = [ ":default_types", ":serialization", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", "//tensorflow/python/types:trace", "@absl_py//absl/testing:parameterized", @@ -121,6 +123,7 @@ py_strict_test( deps = [ ":custom_nest_trace_type", ":default_types", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", "//tensorflow/python/types:trace", "@absl_py//absl/testing:parameterized", @@ -143,6 +146,7 @@ py_strict_test( python_version = "PY3", deps = [ ":serialization", + #internal proto upb dep "//tensorflow/core/function/trace_type:serialization_test_proto_py", "//tensorflow/python/platform:client_testlib", ], diff --git a/tensorflow/core/function/trace_type/trace_type_test.py b/tensorflow/core/function/trace_type/trace_type_test.py index 0ef6e8d8d75adc..3e9c7dbe06b05a 100644 --- a/tensorflow/core/function/trace_type/trace_type_test.py +++ b/tensorflow/core/function/trace_type/trace_type_test.py @@ -439,29 +439,29 @@ def testDictofTensorSpecs(self): class TraceTypeMemoryTest(test.TestCase): - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testGeneric(self): trace_type.from_value(1) trace_type.from_value(DummyGenericClass()) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testTensor(self): tensor = array_ops.zeros([10]) trace_type.from_value(tensor) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testTuple(self): trace_type.from_value((1, 2, 3)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDict(self): trace_type.from_value({1: 1, 2: 2, 3: 3}) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testList(self): trace_type.from_value([1, 2, 3]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testAttrs(self): trace_type.from_value(TestAttrsClass(1, 2)) diff --git a/tensorflow/core/function/transform/BUILD b/tensorflow/core/function/transform/BUILD index 91e7302db8229a..9d7b60dd1328e7 100644 
--- a/tensorflow/core/function/transform/BUILD +++ b/tensorflow/core/function/transform/BUILD @@ -43,6 +43,8 @@ py_strict_test( tags = ["no_oss"], # TODO(b/219089812) deps = [ ":transform", + "@absl_py//absl/testing:parameterized", + #internal proto upb dep "//tensorflow/core/function/testing:test_pass_py", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:constant_op", @@ -57,6 +59,5 @@ py_strict_test( "//tensorflow/python/platform:client_testlib", "//tensorflow/python/saved_model:load", "//tensorflow/python/saved_model:save", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 4342ec637492ec..84b33460db1b03 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -841,7 +841,6 @@ class SymbolicShapeRefiner { } int output_port_num = input_tensor.index(); - AttrValue attr_output_shape; TensorShapeProto proto; const auto handle = input_ic->output(output_port_num); input_ic->ShapeHandleToProto(handle, &proto); diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index ecd559734ea870..8e79043af832a6 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -910,7 +910,7 @@ tf_kernel_library( tf_cuda_cc_test( name = "remapper_test", srcs = ["remapper_test.cc"], - tags = ["no_rocm"], + tags = [], deps = [ ":remapper", "//tensorflow/cc:cc_ops", diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc index 2eed9cd40061a9..689185fb08923d 100644 --- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc +++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc @@ -932,7 +932,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) { EXPECT_EQ(tensors.size(), tensors_expected.size()); EXPECT_EQ(tensors.size(), item.fetch.size()); for (int i = 0; i < item.fetch.size(); ++i) { - test::ExpectClose(tensors_expected[i], tensors[i], -1, 2e-4); + test::ExpectClose(tensors_expected[i], tensors[i], -1, 4e-4); } } diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD index 5519abcf25aaa9..4edc6cc529451a 100644 --- a/tensorflow/core/grappler/optimizers/data/BUILD +++ b/tensorflow/core/grappler/optimizers/data/BUILD @@ -1,6 +1,6 @@ -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow/core/platform:build_config.bzl", "tf_protos_all") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -373,6 +373,7 @@ cc_library( "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:functional_ops", + "@com_google_absl//absl/strings", ] + tf_protos_all(), ) @@ -389,6 +390,8 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/platform:types", + "@com_google_absl//absl/strings", ] + tf_protos_all(), ) @@ -688,6 +691,7 @@ cc_library( "//tensorflow/core/grappler/utils:topological_sort", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", ] + 
tf_protos_all(), alwayslink = 1, ) @@ -708,6 +712,7 @@ tf_cc_test( "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/platform:status", + "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", "@local_tsl//tsl/platform:errors", ], diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc index 0cd0db36808485..62c615c45905fc 100644 --- a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc +++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/data/fusion_utils.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/strip.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -31,15 +34,32 @@ limitations under the License. #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace grappler { namespace fusion_utils { namespace { + +// See the comment for the proto field `tensorflow.NodeDef.input`. +constexpr char kControlInputPrefix[] = "^"; + +bool IsControlInput(const string& node_input) { + return absl::StartsWith(node_input, kControlInputPrefix); +} + +string StripControlInputNotation(const string& node_input) { + return string(absl::StripPrefix(node_input, kControlInputPrefix)); +} + +string AddControlInputNotation(const string& node_input) { + return absl::StrCat(kControlInputPrefix, node_input); +} + +// Returns e.g. `"node"` given `"node:out"` or `"node:out:0"`. See the comment +// for the proto field `tensorflow.FunctionDef.node_def`. string ParseNodeConnection(const string& name) { - // If input/output node name has semicolon, take the prefix. Otherwise take - // the whole string. 
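The three helpers above exist because a control input is encoded as the producer's name with a `^` prefix, so any renaming must strip the prefix before parsing the node name and re-attach it afterwards. A standalone sketch of that dance follows; the `RenameInput` helper and the sample mapping are illustrative, and the real code paths are `GetUniqueSignature` and `FuseFunctionNodes` below.

```cpp
#include <iostream>
#include <string>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/strip.h"

// Renames the producer of `node_input` according to `renames`, preserving the
// control-input prefix and any ":out:idx" suffix.
std::string RenameInput(
    const std::string& node_input,
    const absl::flat_hash_map<std::string, std::string>& renames) {
  const bool is_control_input = absl::StartsWith(node_input, "^");
  const std::string stripped(absl::StripPrefix(node_input, "^"));
  // Only the node name (the part before any ":out:idx" suffix) is remapped.
  const std::string name = stripped.substr(0, stripped.find(':'));
  const std::string suffix = stripped.substr(name.size());
  auto it = renames.find(name);
  if (it == renames.end()) return node_input;
  const std::string renamed = absl::StrCat(it->second, suffix);
  return is_control_input ? absl::StrCat("^", renamed) : renamed;
}

int main() {
  absl::flat_hash_map<std::string, std::string> renames = {{"y", "y_0"}};
  std::cout << RenameInput("^y", renames) << "\n";       // prints "^y_0"
  std::cout << RenameInput("y:out:0", renames) << "\n";  // prints "y_0:out:0"
}
```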
return name.substr(0, name.find(':')); } @@ -194,10 +214,15 @@ OpDef GetUniqueSignature(const OpDef& first_signature, for (NodeDef& function_node : *nodes_to_fuse) { for (auto& node_input : *function_node.mutable_input()) { - const auto& input = ParseNodeConnection(node_input); + bool is_control_input = IsControlInput(node_input); + const auto& input = + ParseNodeConnection(StripControlInputNotation(node_input)); if (const string* new_name = gtl::FindOrNull(changed_input_names, input)) { node_input = *new_name + ParseOutputNode(node_input); + if (is_control_input) { + node_input = AddControlInputNotation(node_input); + } } } } @@ -215,7 +240,9 @@ void FuseFunctionNodes(const StringCollection& first_inputs, protobuf::RepeatedPtrField* nodes_to_fuse) { for (NodeDef& function_node : *nodes_to_fuse) { for (auto& node_input : *function_node.mutable_input()) { - auto parsed_name = ParseNodeConnection(node_input); + bool is_control_input = IsControlInput(node_input); + auto parsed_name = + ParseNodeConnection(StripControlInputNotation(node_input)); auto input_it = std::find(second_inputs.begin(), second_inputs.end(), parsed_name); @@ -224,6 +251,9 @@ void FuseFunctionNodes(const StringCollection& first_inputs, auto arg_num = std::distance(second_inputs.begin(), input_it); node_input = set_input(first_inputs, second_inputs, first_outputs, arg_num); + if (is_control_input) { + node_input = AddControlInputNotation(node_input); + } } } } diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc index e667affeeaf7f4..84c22590926a5b 100644 --- a/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc +++ b/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc @@ -15,38 +15,39 @@ limitations under the License. 
#include "tensorflow/core/grappler/optimizers/data/fusion_utils.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/data/function_utils.h" #include "tensorflow/core/grappler/optimizers/data/graph_utils.h" - #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace grappler { namespace fusion_utils { namespace { -string ParseNodeConnection(const string &name) { +string ParseNodeConnection(const string& name) { return name.substr(0, name.find(':')); } -void CheckUniqueNames(const FunctionDef &function) { +void CheckUniqueNames(const FunctionDef& function) { std::unordered_set inputs; - for (const auto &input_arg : function.signature().input_arg()) + for (const auto& input_arg : function.signature().input_arg()) inputs.insert(input_arg.name()); EXPECT_EQ(inputs.size(), function.signature().input_arg_size()); std::unordered_set outputs; - for (const auto &output_arg : function.signature().output_arg()) + for (const auto& output_arg : function.signature().output_arg()) outputs.insert(output_arg.name()); EXPECT_EQ(outputs.size(), function.signature().output_arg_size()); std::unordered_set nodes; - for (const auto &node : function.node_def()) nodes.insert(node.name()); + for (const auto& node : function.node_def()) nodes.insert(node.name()); EXPECT_EQ(nodes.size(), function.node_def_size()); } @@ -71,7 +72,7 @@ TEST(FusionUtilsTest, FuseFunctionsByComposition) { CheckUniqueNames(*fused_function); const NodeDef *parent_mul = nullptr, *output_mul = nullptr; - for (const auto &fused_node : fused_function->node_def()) { + for (const auto& fused_node : fused_function->node_def()) { if (fused_node.op() == "Mul") { if (fused_node.name() == "y") parent_mul = &fused_node; @@ -89,6 +90,44 @@ TEST(FusionUtilsTest, FuseFunctionsByComposition) { EXPECT_EQ(ParseNodeConnection(output_value), output_mul->name()); } +TEST(FusionUtilsTest, FuseFunctionsWithControlInputs) { + GraphDef graph; + auto *parent_function = graph.mutable_library()->add_function(); + *parent_function = test::function::XTimesTwoWithControlInput(); + auto *function = graph.mutable_library()->add_function(); + *function = test::function::XTimesTwoWithControlInput(); + + auto *fused_function = FuseFunctions( + *parent_function, *function, "fused_maps", fusion_utils::ComposeSignature, + fusion_utils::ComposeInput, fusion_utils::ComposeOutput, + fusion_utils::MergeNodes, graph.mutable_library()); + + EXPECT_EQ(fused_function->signature().name(), "fused_maps"); + EXPECT_EQ(fused_function->signature().input_arg_size(), 1); + EXPECT_EQ(fused_function->signature().output_arg_size(), 1); + EXPECT_EQ(fused_function->ret_size(), 1); + CheckUniqueNames(*fused_function); + + const NodeDef *parent_mul = nullptr, *output_mul = nullptr; + for (const auto& fused_node : fused_function->node_def()) { + if (fused_node.op() == "Mul") { + if (fused_node.name() == "y") + parent_mul = &fused_node; + else + output_mul = &fused_node; + } + } + ASSERT_NE(parent_mul, nullptr); + ASSERT_NE(output_mul, nullptr); + EXPECT_EQ(ParseNodeConnection(output_mul->input(1)), + absl::StrCat("^", parent_mul->name())); + + auto output_value = fused_function->ret().at( + fused_function->signature().output_arg(0).name()); + 
+ EXPECT_EQ(ParseNodeConnection(output_value), output_mul->name()); +} + TEST(FusionUtilsTest, FuseFunctionWithPredicate) { GraphDef graph; auto *xtimes_two = graph.mutable_library()->add_function(); @@ -112,7 +151,7 @@ TEST(FusionUtilsTest, FuseFunctionWithPredicate) { ASSERT_TRUE( function_utils::ContainsFunctionNodeWithOp("Equal", *fused_function)); - const auto &equal_node = fused_function->node_def( + const auto& equal_node = fused_function->node_def( function_utils::FindFunctionNodeWithOp("Equal", *fused_function)); EXPECT_EQ(xtimes_two->signature().output_arg(0).name(), @@ -152,8 +191,8 @@ TEST(FusionUtilsTest, ZipFusion) { auto *function = graph.mutable_library()->add_function(); *function = test::function::XTimesTwo(); - auto zip_signature = [](const OpDef &parent_function_signature, - const OpDef &function_signature, + auto zip_signature = [](const OpDef& parent_function_signature, + const OpDef& function_signature, OpDef *fused_function_signature) { *fused_function_signature = parent_function_signature; fused_function_signature->mutable_input_arg()->MergeFrom( @@ -162,9 +201,9 @@ TEST(FusionUtilsTest, ZipFusion) { function_signature.output_arg()); }; - auto zip_input = [](const StringCollection &parent_inputs, - const StringCollection &function_inputs, - const StringCollection &parent_outputs, int arg_num) { + auto zip_input = [](const StringCollection& parent_inputs, + const StringCollection& function_inputs, + const StringCollection& parent_outputs, int arg_num) { // Take corresponding parent output. return function_inputs.at(arg_num); }; diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc index cfd6826ee8ccce..a537b794760c65 100644 --- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc +++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc @@ -17,6 +17,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/log/log.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -79,13 +80,30 @@ bool SameDeterministicAttr(const NodeDef& parallel_map_node, return false; } +// Returns a name for a new node or function that fuses the inputs. +// - For nodes, this is only for debugging. +// - For functions, this additionally prevents collisions (upstream of this +// optimizer, the act of optimizing a single graph entails individually +// optimizing each function in that graph and later aggregating any new +// functions introduced during these individual optimizations into that single +// graph's collective function library). +// TODO(mpcallanan): Look at deduping names in a more generic fashion upstream. +string GetFusedName(const NodeDef& parent, const NodeDef& child) { + return absl::StrCat("map_fusion_nodes/", parent.name(), "/", child.name()); +} +string GetFusedName(const FunctionDef& parent, const FunctionDef& child) { + return absl::StrCat("map_fusion_funcs/", parent.signature().name(), "/", + child.signature().name()); +} + // Sets basic function parameters and copies attributes from parent and map // node. 
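To make the naming scheme concrete: the expectations in map_fusion_test.cc later in this diff use node names map1/map2 with functions fnA/fnB, which produce "map_fusion_nodes/map1/map2" and "map_fusion_funcs/fnA/fnB". A tiny runnable check of the collision-avoidance property; the helper below merely mirrors the StrCat format used by GetFusedName above.

```cpp
#include <cassert>
#include <string>

#include "absl/strings/str_cat.h"

// Mirrors GetFusedName(const FunctionDef&, const FunctionDef&) for two names.
std::string FusedFuncName(const std::string& parent, const std::string& child) {
  return absl::StrCat("map_fusion_funcs/", parent, "/", child);
}

int main() {
  // Previously both fusions would have requested the same name ("fused_map");
  // with per-pair names they stay distinct across independent optimizations.
  assert(FusedFuncName("fnA", "fnB") != FusedFuncName("fnC", "fnD"));
  assert(FusedFuncName("fnA", "fnB") == "map_fusion_funcs/fnA/fnB");
  return 0;
}
```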
NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node, const FunctionDef& fused_function, MutableGraphView* graph) { NodeDef fused_node; - graph_utils::SetUniqueGraphNodeName("fused_map", graph->graph(), &fused_node); + graph_utils::SetUniqueGraphNodeName(GetFusedName(parent_map_node, map_node), + graph->graph(), &fused_node); if (map_node.op() == kMapDatasetOp) { fused_node.set_op(kMapDatasetOp); @@ -185,9 +203,10 @@ Status MapFusion::OptimizeAndCollectStats(Cluster* cluster, return nullptr; } return fusion_utils::FuseFunctions( - *parent_func, *func, "fused_map", fusion_utils::ComposeSignature, - fusion_utils::ComposeInput, fusion_utils::ComposeOutput, - fusion_utils::MergeNodes, output->mutable_library()); + *parent_func, *func, GetFusedName(*parent_func, *func), + fusion_utils::ComposeSignature, fusion_utils::ComposeInput, + fusion_utils::ComposeOutput, fusion_utils::MergeNodes, + output->mutable_library()); }; for (const NodeDef& node : sorted_old_graph.node()) { diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc index 76b45248c3f17c..c81191ecd823df 100644 --- a/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc +++ b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/grappler/grappler_item.h" @@ -186,6 +187,55 @@ TEST(MapFusionTest, FuseTwoParallelMapNodesIntoOne) { EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map2", output)); } +TEST(MapFusionTest, FusedNodesAndFunctionsAreNamedAfterOldNodesAndFunctions) { + using test::function::NDef; + NodeDef num_parallel_calls_node = CreateScalarConstNodeHelper( + "num_parallel_calls", DT_INT64, + [](TensorProto* proto) { proto->add_int64_val(-1); }); + auto graph = [&num_parallel_calls_node]( + const std::string& parent_map_node_name, + const std::string& map_node_name, + const std::string& parent_function_name, + const std::string& function_name) { + FunctionDef parent_fn = test::function::XTimesTwo(); + FunctionDef fn = test::function::XTimesTwo(); + parent_fn.mutable_signature()->set_name(parent_function_name); + fn.mutable_signature()->set_name(function_name); + return test::function::GDef( + {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}), + NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}), + NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}), + NDef("range", "RangeDataset", {"start", "stop", "step"}, {}), + num_parallel_calls_node, + MakeParallelMapV2Node(parent_map_node_name, "range", + num_parallel_calls_node.name(), + parent_function_name, "default"), + MakeParallelMapV2Node(map_node_name, parent_map_node_name, + num_parallel_calls_node.name(), function_name, + "default")}, + // FunctionLib + {parent_fn, fn}); + }; + + GrapplerItem item_1; + item_1.graph = graph("map1", "map2", "fnA", "fnB"); + GraphDef output_1; + TF_ASSERT_OK(OptimizeWithMapFusion(item_1, &output_1, true)); + EXPECT_TRUE(graph_utils::ContainsGraphNodeWithName( + "map_fusion_nodes/map1/map2", output_1)); + EXPECT_TRUE(graph_utils::ContainsGraphFunctionWithName( + "map_fusion_funcs/fnA/fnB", output_1.library())); + + GrapplerItem item_2; + item_2.graph = graph("map3", "map4", "fnC", "fnD"); + GraphDef output_2; + TF_ASSERT_OK(OptimizeWithMapFusion(item_2, 
&output_2, true)); + EXPECT_TRUE(graph_utils::ContainsGraphNodeWithName( + "map_fusion_nodes/map3/map4", output_2)); + EXPECT_TRUE(graph_utils::ContainsGraphFunctionWithName( + "map_fusion_funcs/fnC/fnD", output_2.library())); +} + } // namespace } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc index 2896e3e703caa5..98caa6218376e3 100644 --- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc @@ -1551,6 +1551,142 @@ TEST_F(MklFuseInstanceNormTest, FuseMklInstanceNormWithActivation4D_FP32_NCHW) { FuseMklInstanceNorm4D("NCHW", true); } +class FusedConvBiasAddAndHardSwishTest : public GrapplerTest { + public: + const string kAddOp = "Add"; + const string kAddV2Op = "AddV2"; + + template + void RunTest(const string& add_op, const bool is_depthwise) { + using ::tensorflow::ops::Placeholder; + + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3}); + auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128}); + auto bias_shape = ops::Placeholder::Shape({is_depthwise ? 384 : 128}); + + auto input = Placeholder(s.WithOpName("input"), DType, input_shape); + auto filter = Placeholder(s.WithOpName("filter"), DType, filter_shape); + auto bias = Placeholder(s.WithOpName("bias"), DType, bias_shape); + const DataType const_dt = with_cast_op ? DT_FLOAT : DType; + typedef typename EnumToDataType::Type DT; + Tensor three(const_dt, TensorShape({})); + Tensor one_sixth(const_dt, TensorShape({})); + three.scalar
<DT>()() = static_cast<DT>(3.0f);
+    one_sixth.scalar<DT>()() = static_cast<DT>
(1.0f / 6.0f); + auto three_op = + with_cast_op + ? ops::Cast(s.WithOpName("three"), Input::Initializer(three), + DT_BFLOAT16) + : ops::Const(s.WithOpName("three"), Input::Initializer(three)); + auto one_sixth_op = + with_cast_op ? ops::Cast(s.WithOpName("one_sixth"), + Input::Initializer(one_sixth), DT_BFLOAT16) + : ops::Const(s.WithOpName("one_sixth"), + Input::Initializer(one_sixth)); + + std::vector strides = {1, 1, 1, 1}; + Output conv; + if (is_depthwise) { + conv = ops::DepthwiseConv2dNative( + s.WithOpName("conv"), input, filter, strides, "SAME", + ops::DepthwiseConv2dNative::Attrs().DataFormat("NHWC")); + } else { + conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME", + ops::Conv2D::Attrs().DataFormat("NHWC")); + } + auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias, + ops::BiasAdd::Attrs().DataFormat("NHWC")); + + Output add; + if (add_op == kAddV2Op) { + add = ops::AddV2(s.WithOpName(add_op), three_op, bias_add); + } else { + add = ops::Add(s.WithOpName(add_op), three_op, bias_add); + } + + auto relu6 = ops::Relu6(s.WithOpName("relu_6"), add); + auto mul_one_sixth = + ops::Mul(s.WithOpName("mul_one_sixth"), one_sixth_op, bias_add); + auto mul_output = ops::Mul(s.WithOpName("output"), mul_one_sixth, relu6); + + auto fetch = ops::Identity(s.WithOpName("fetch"), mul_output); + + auto input_tensor = GenerateTensorWithSetRandom( + TensorShape(input_shape.shape_.dim_sizes())); + auto filter_tensor = GenerateTensorWithSetRandom( + TensorShape(filter_shape.shape_.dim_sizes())); + auto bias_tensor = GenerateTensorWithSetRandom( + TensorShape(bias_shape.shape_.dim_sizes())); + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_tensor}, + {"filter", filter_tensor}, + {"bias", bias_tensor}}; + + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
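+    // The HardSwish fusion is a oneDNN (CPU-only) rewrite, so the test pins
+    // every node to CPU before running the remapper.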
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::ON); + GraphDef output; + TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "output") { + if (is_depthwise) { + EXPECT_EQ("_FusedDepthwiseConv2dNative", node.op()); + } else { + EXPECT_EQ("_FusedConv2D", node.op()); + } + EXPECT_EQ("input", node.input(0)); + EXPECT_EQ("filter", node.input(1)); + EXPECT_EQ("bias", node.input(2)); + EXPECT_EQ(1, node.attr().at("num_args").i()); + + const auto fused_ops = node.attr().at("fused_ops").list().s(); + EXPECT_EQ(2, fused_ops.size()); + EXPECT_EQ("BiasAdd", fused_ops[0]); + EXPECT_EQ("_FusedHardSwish", fused_ops[1]); + found++; + } + } + EXPECT_EQ(1, found); + + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed); + auto tensors = EvaluateNodes(output, item.fetch, item.feed); + EXPECT_EQ(1, tensors_expected.size()); + EXPECT_EQ(1, tensors.size()); + test::ExpectClose(tensors_expected[0], tensors[0], 1e-6); + } +}; + +TEST_F(FusedConvBiasAddAndHardSwishTest, Float32Conv2DBiasHardSwish) { + RunTest("AddV2", false); +} +TEST_F(FusedConvBiasAddAndHardSwishTest, Float32DWConv2DBiasHardSwish) { + RunTest("AddV2", true); +} +TEST_F(FusedConvBiasAddAndHardSwishTest, Bfloat16Conv2DBiasHardSwish) { + RunTest("Add", false); +} +TEST_F(FusedConvBiasAddAndHardSwishTest, Bfloat16DWConv2DBiasHardSwish) { + RunTest("Add", true); +} +TEST_F(FusedConvBiasAddAndHardSwishTest, Bfloat16Conv2DBiasHardSwishWithCast) { + RunTest("Add", false); +} +TEST_F(FusedConvBiasAddAndHardSwishTest, + Bfloat16DWConv2DBiasHardSwishWithCast) { + RunTest("Add", true); +} + } // namespace grappler } // namespace tensorflow #endif // INTEL_MKL && ENABLE_MKL diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 3c37150f496aa4..3723290691ce28 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1060,7 +1060,8 @@ bool FindConv2DWithBatchNormAndActivation( bool FindContractionWithBiasInPort(const RemapperContext& ctx, const utils::MutableNodeView& add_node_view, const NodeDef& add_node_def, int port_id, - ContractionWithBiasAdd* base) { + ContractionWithBiasAdd* base, + const int allowed_fanouts = 1) { // Input to AddN must match ContractionWithBiasAdd pattern. 
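+  // `allowed_fanouts` bounds how many regular fanouts the BiasAdd output may
+  // have before the match is rejected. The default of 1 keeps the previous
+  // HasAtMostOneFanoutAtPort0 behavior; the HardSwish fusion later in this
+  // file passes 2 because the BiasAdd result is consumed twice in that
+  // pattern.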
if (add_node_view.NumRegularFanins() < port_id + 1) return false; const auto& bias_add_node_view = @@ -1071,7 +1072,7 @@ bool FindContractionWithBiasInPort(const RemapperContext& ctx, if (!FindContractionWithBias(ctx, bias_add_node_view->node_index(), base, /*check_device_compatible=*/false)) return false; - if (!HasAtMostOneFanoutAtPort0(*bias_add_node_view) || + if (bias_add_node_view->GetRegularFanout(0).size() > allowed_fanouts || !HaveSameDataType(&add_node_def, bias_add_node_def) || IsInPreserveSet(ctx, bias_add_node_def)) return false; @@ -2670,6 +2671,140 @@ bool FindTensorToHashBucket(const RemapperContext& ctx, int node_index, return true; } +// clang-format off +// HardSwish pattern +// input Const (value: 3) +// | \ / +// | Add or AddV2 +// | | +// | Relu6 +// | / +// | / +// Const (value: 0.1666) | / +// \ | / +// Mul / +// \ / +// Mul +// clang-format on +bool FindHardSwish(RemapperContext& ctx, int node_index, + std::map* matched_nodes_map, + std::set* remove_node_indices) { + if (!IsMKLEnabled()) return false; + + using utils::MatchingDirection; + using utils::NodeStatus; + // clang-format off + utils::OpTypePattern pattern {"Mul", "output", NodeStatus::kReplace, + { + {"Mul", "mul_one_sixth", NodeStatus::kRemove, + { + {"Const|Cast", "one_sixth", NodeStatus::kRemain}, + {"*", "input", NodeStatus::kRemain} + } + }, + {"Relu6", "relu6", NodeStatus::kRemove, + { + {"Add|AddV2", "add", NodeStatus::kRemove, + { + {"*", "input", NodeStatus::kRemain}, + {"Const|Cast", "three", NodeStatus::kRemain} + } + } + } + }, + } + }; + // clang-format on + bool found_match = false; + utils::SubGraphMatcher graph_matcher( + &(ctx.graph_view)); + + matched_nodes_map->clear(); + remove_node_indices->clear(); + + found_match = graph_matcher.GetMatchedNodes( + pattern, ctx.nodes_to_preserve, ctx.graph_view.GetNode(node_index), + matched_nodes_map, remove_node_indices); + + if (found_match) { + // Check if the values of Const nodes are as expected + std::map values_map = {{"three", 3.0}, + {"one_sixth", 0.16666}}; + if (!VerifyConstants(&ctx, matched_nodes_map, &values_map)) return false; + } + + return found_match; +} + +// clang-format off +// Contraction + BiasAdd + _FusedHardSwish activation +// input filter +// \ / +// Contraction bias +// | / +// BiasAdd +// | +// _FusedHardSwish +// clang-format on +bool FindContractionWithBiasAddAndHardSwish( + RemapperContext& ctx, int node_index, + std::map* matched_nodes_map, + std::set* remove_node_indices) { + if (!IsMKLEnabled()) return false; + + const auto* node_view = ctx.graph_view.GetNode(node_index); + if (HasControlFaninOrFanout(*node_view)) return false; + + // Check if HardSwish pattern is available + if (!FindHardSwish(ctx, node_index, matched_nodes_map, remove_node_indices)) + return false; + // Get handle of Add|AddV2 op that is the root of HardSwish pattern. + const auto* add_node_view = + ctx.graph_view.GetNode(matched_nodes_map->at("add")); + const auto* add_node_def = add_node_view->node(); + + // Check if ContractionWithBias pattern is feeding HardSwish + ContractionWithBiasAdd base; + int port_id = 0; + // BiasAdd node is expected to have 2 fanouts feeding the HardSwish pattern. 
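+  // In the matched subgraph the BiasAdd output feeds both the Add(x, 3) node
+  // and the outer Mul(1/6, x) node, i.e. the graph computes
+  // x * relu6(x + 3) * (1/6) == hard_swish(x), so two fanouts are allowed
+  // here instead of the usual one.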
+ if (!FindContractionWithBiasInPort(ctx, *add_node_view, *add_node_def, + port_id, &base, /*allowed_fanouts*/ 2)) { + port_id = 1; + if (!FindContractionWithBiasInPort(ctx, *add_node_view, *add_node_def, + port_id, &base, /*allowed_fanouts*/ 2)) { + VLOG(2) << "Contraction + BiasAdd pattern was not found although" + << " HardSwish pattern was found, so fusion failed."; + return false; + } + } + + // Get the BiasAdd node + const auto* bias_node_def = ctx.graph_view.GetNode(base.bias_add)->node(); + if (!HaveSameDataType(add_node_def, bias_node_def)) return false; + + // Get the contraction node + const auto* contraction_node_view = ctx.graph_view.GetNode(base.contraction); + const auto* contraction_node_def = contraction_node_view->node(); + + // Currently only Conv2D and DepthwiseConv2D contraction ops are supported + if (!IsConv2D(*contraction_node_def) && + !IsDepthwiseConv2dNative(*contraction_node_def)) + return false; + + // Check if contraction is compatible with CPU + if (!IsCpuCompatibleConv2D(ctx, contraction_node_def) && + !IsCpuCompatibleDepthwiseConv2dNative(contraction_node_def)) + return false; + + // We found a {Conv2D, DepthwiseConv2D}+BiasAdd+_FusedHardSwish pattern. + matched_nodes_map->insert({"contraction", base.contraction}); + matched_nodes_map->insert({"bias_add", base.bias_add}); + + remove_node_indices->insert(base.contraction); + remove_node_indices->insert(base.bias_add); + return true; +} + bool FindFusedBatchMatMul(RemapperContext* ctx, int node_index, std::map* matched_nodes_map, std::set* remove_node_indices, @@ -3537,6 +3672,47 @@ Status AddFusedContractionNode( return OkStatus(); } +Status FuseContractionWithBiasAddAndHardSwish( + RemapperContext* ctx, std::map* matched_nodes_map, + std::set* remove_node_indices, std::vector* invalidated_nodes, + std::vector* nodes_to_delete) { + auto* output_node = + ctx->graph_view.GetNode(matched_nodes_map->at("output"))->node(); + auto* contraction_node = + ctx->graph_view.GetNode(matched_nodes_map->at("contraction"))->node(); + auto* bias_add_node = + ctx->graph_view.GetNode(matched_nodes_map->at("bias_add"))->node(); + + bool is_conv2d = IsConv2D(*contraction_node); + + NodeDef fused_node; + fused_node.set_name(output_node->name()); + fused_node.set_op(is_conv2d ? 
kFusedConv2D : kFusedDepthwiseConv2dNative); + fused_node.set_device(contraction_node->device()); + fused_node.add_input(contraction_node->input(0)); + fused_node.add_input(contraction_node->input(1)); + fused_node.add_input(bias_add_node->input(1)); + + if (is_conv2d) { + CopyConv2DAttributes(*contraction_node, &fused_node); + } else { + CopyDepthwiseConv2dNativeAttributes(*contraction_node, &fused_node); + } + SetFusedOpAttributes(&fused_node, {"BiasAdd", "_FusedHardSwish"}); + + utils::Mutation* mutation = ctx->graph_view.GetMutationBuilder(); + Status status; + mutation->AddNode(std::move(fused_node), &status); + TF_RETURN_IF_ERROR(status); + TF_RETURN_IF_ERROR(mutation->Apply()); + (*invalidated_nodes)[matched_nodes_map->at("output")] = true; + + for (const auto& node_idx : *remove_node_indices) { + (*nodes_to_delete)[node_idx] = true; + } + return OkStatus(); +} + Status FuseConv2DSwish(RemapperContext* ctx, const std::map& matched_nodes_map, const std::set& remove_node_indices, @@ -4703,6 +4879,15 @@ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, std::set remove_node_indices; std::vector input_node_names; + // Remap {Conv2D|DepthwiseConv2D} + BiasAdd + HardSwish subgraph + if (FindContractionWithBiasAddAndHardSwish(ctx, i, &matched_nodes_map, + &remove_node_indices)) { + TF_RETURN_IF_ERROR(FuseContractionWithBiasAddAndHardSwish( + &ctx, &matched_nodes_map, &remove_node_indices, &invalidated_nodes, + &nodes_to_delete)); + continue; + } + // Softplus + Tanh + Mul to Mish conversion matched_nodes_map.clear(); remove_node_indices.clear(); diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index d3a6652589381b..76c7098361d6f2 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -888,6 +888,163 @@ TEST_F(RemapperFuseConvWithBiasAndActivation, Conv3D_BF16) { RunTest<3, DT_BFLOAT16>(); } +class RemapperFuseConvWithBiasAndAddActivation : public RemapperTest { + public: + template + void RunTest() { + if (!IsMKLEnabled()) GTEST_SKIP() << "Test only applicable to oneDNN."; + using ::tensorflow::ops::Placeholder; + + for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + auto input_shape = Placeholder::Shape({8, 32, 32, 3}); + auto filter_shape = Placeholder::Shape({1, 1, 3, 128}); + auto bias_shape = Placeholder::Shape({128}); + auto add_shape = ops::Placeholder::Shape({8, 32, 32, 128}); + + auto input_t = GenerateRandomTensor({8, 32, 32, 3}); + auto filter_t = GenerateRandomTensor({1, 1, 3, 128}); + auto bias_t = GenerateRandomTensor({128}); + auto add_t = GenerateRandomTensor({8, 32, 32, 128}); + + float leakyrelu_alpha = 0.5; + + std::vector strides = {1, 1, 1, 1}; + + if (dim == 3) { + input_shape = Placeholder::Shape({8, 4, 32, 32, 3}); + filter_shape = Placeholder::Shape({1, 1, 1, 3, 128}); + bias_shape = Placeholder::Shape({128}); + add_shape = ops::Placeholder::Shape({8, 4, 32, 32, 128}); + strides = {1, 1, 1, 1, 1}; + + input_t = GenerateRandomTensor({8, 4, 32, 32, 3}); + filter_t = GenerateRandomTensor({1, 1, 1, 3, 128}); + bias_t = GenerateRandomTensor({128}); + add_t = GenerateRandomTensor({8, 4, 32, 32, 128}); + } + + auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape); + auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape); + auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape); + 
auto input_add = + Placeholder(s.WithOpName("input_add"), DT_FLOAT, add_shape); + + if (dim == 2) { + auto conv = + ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME"); + auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias); + auto add = ops::Add(s.WithOpName("add_op"), input_add, bias_add); + + ops::Identity fetch = [&]() -> ops::Identity { + auto activate = s.WithOpName("activation"); + auto fetch = s.WithOpName("fetch"); + + if (activation == "Relu") { + return ops::Identity(fetch, ops::Relu(activate, add)); + } else if (activation == "Relu6") { + return ops::Identity(fetch, ops::Relu6(activate, add)); + } else if (activation == "Elu") { + return ops::Identity(fetch, ops::Elu(activate, add)); + } else if (activation == "LeakyRelu") { + auto attr = ops::internal::LeakyRelu::Alpha(leakyrelu_alpha); + return ops::Identity(fetch, + ops::internal::LeakyRelu(activate, add, attr)); + } + + return ops::Identity(fetch, bias); + }(); + } else if (dim == 3) { + auto conv = + ops::Conv3D(s.WithOpName("conv"), input, filter, strides, "SAME"); + auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias); + auto add = ops::Add(s.WithOpName("add_op"), input_add, bias_add); + + ops::Identity fetch = [&]() -> ops::Identity { + auto activate = s.WithOpName("activation"); + auto fetch = s.WithOpName("fetch"); + + if (activation == "Relu") { + return ops::Identity(fetch, ops::Relu(activate, add)); + } else if (activation == "Relu6") { + return ops::Identity(fetch, ops::Relu6(activate, add)); + } else if (activation == "Elu") { + return ops::Identity(fetch, ops::Elu(activate, add)); + } else if (activation == "LeakyRelu") { + auto attr = ops::internal::LeakyRelu::Alpha(leakyrelu_alpha); + return ops::Identity(fetch, + ops::internal::LeakyRelu(activate, add, attr)); + } + + return ops::Identity(fetch, bias); + }(); + } + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_t}, + {"filter", filter_t}, + {"bias", bias_t}, + {"input_add", add_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
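+      // The graph built above is Conv2D/Conv3D -> BiasAdd -> Add -> <activation>;
+      // the checks below expect the remapper (run at RewriterConfig::AGGRESSIVE)
+      // to collapse it into a single _FusedConv2D/_FusedConv3D with
+      // fused_ops = {"BiasAdd", "Add", <activation>}. The fusion is CPU-only,
+      // hence the explicit CPU placement that follows.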
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "activation") { + if (dim == 2) { + EXPECT_EQ(node.op(), "_FusedConv2D"); + } else if (dim == 3) { + EXPECT_EQ(node.op(), "_FusedConv3D"); + } + ASSERT_GE(node.input_size(), 3); + EXPECT_EQ(node.input(0), "input"); + EXPECT_EQ(node.input(1), "filter"); + + EXPECT_EQ(node.attr().at("num_args").i(), 2); + EXPECT_EQ(node.input(2), "bias"); + + const auto fused_ops = node.attr().at("fused_ops").list().s(); + ASSERT_EQ(fused_ops.size(), 3); + EXPECT_EQ("BiasAdd", fused_ops[0]); + EXPECT_EQ("Add", fused_ops[1]); + EXPECT_EQ(activation, fused_ops[2]); + found++; + } + } + EXPECT_EQ(found, 1); + + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed); + ASSERT_EQ(tensors_expected.size(), 1); + auto tensors = EvaluateNodes(output, item.fetch, item.feed); + ASSERT_EQ(tensors.size(), 1); + test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); + } + } +}; + +TEST_F(RemapperFuseConvWithBiasAndAddActivation, Conv2D_F32) { + RunTest<2, DT_FLOAT>(); +} +TEST_F(RemapperFuseConvWithBiasAndAddActivation, Conv3D_F32) { + RunTest<3, DT_FLOAT>(); +} +TEST_F(RemapperFuseConvWithBiasAndAddActivation, Conv2D_BF16) { + RunTest<2, DT_BFLOAT16>(); +} +TEST_F(RemapperFuseConvWithBiasAndAddActivation, Conv3D_BF16) { + RunTest<3, DT_BFLOAT16>(); +} + class RemapperFuseConvWithSqueezeAndBias : public RemapperTest { public: template @@ -2255,102 +2412,6 @@ TEST_F(RemapperTest, FuseConv3DWithBiasAndAdd) { test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } -TEST_F(RemapperTest, FuseConv3DWithBiasAndAddActivation) { - if (!IsMKLEnabled()) GTEST_SKIP() << "Test only applicable to oneDNN."; - using ::tensorflow::ops::Placeholder; - - for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - - auto input_shape = Placeholder::Shape({8, 4, 32, 32, 3}); - auto filter_shape = Placeholder::Shape({1, 1, 1, 3, 128}); - auto bias_shape = Placeholder::Shape({128}); - auto add_shape = ops::Placeholder::Shape({8, 4, 32, 32, 128}); - - auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape); - auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape); - auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape); - auto input_add = - Placeholder(s.WithOpName("input_add"), DT_FLOAT, add_shape); - - float leakyrelu_alpha = 0.5; - - std::vector strides = {1, 1, 1, 1, 1}; - auto conv = - ops::Conv3D(s.WithOpName("conv"), input, filter, strides, "SAME"); - auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias); - auto add = ops::Add(s.WithOpName("add_op"), input_add, bias_add); - - ops::Identity fetch = [&]() -> ops::Identity { - auto activate = s.WithOpName("activation"); - auto fetch = s.WithOpName("fetch"); - - if (activation == "Relu") { - return ops::Identity(fetch, ops::Relu(activate, add)); - } else if (activation == "Relu6") { - return ops::Identity(fetch, ops::Relu6(activate, add)); - } else if (activation == "Elu") { - return ops::Identity(fetch, ops::Elu(activate, add)); - } else if (activation == "LeakyRelu") { - auto attr = ops::internal::LeakyRelu::Alpha(leakyrelu_alpha); - return ops::Identity(fetch, - ops::internal::LeakyRelu(activate, 
add, attr)); - } - - return ops::Identity(fetch, bias); - }(); - - auto input_t = GenerateRandomTensor({8, 4, 32, 32, 3}); - auto filter_t = GenerateRandomTensor({1, 1, 1, 3, 128}); - auto bias_t = GenerateRandomTensor({128}); - auto add_t = GenerateRandomTensor({8, 4, 32, 32, 128}); - - GrapplerItem item; - item.fetch = {"fetch"}; - item.feed = {{"input", input_t}, - {"filter", filter_t}, - {"bias", bias_t}, - {"input_add", add_t}}; - TF_ASSERT_OK(s.ToGraphDef(&item.graph)); - - // Place all nodes on CPU. - for (int i = 0; i < item.graph.node_size(); ++i) { - item.graph.mutable_node(i)->set_device("/device:CPU:0"); - } - - Remapper optimizer(RewriterConfig::AGGRESSIVE); - GraphDef output; - TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); - - int found = 0; - for (const NodeDef& node : output.node()) { - if (node.name() == "activation") { - EXPECT_EQ(node.op(), "_FusedConv3D"); - ASSERT_GE(node.input_size(), 3); - EXPECT_EQ(node.input(0), "input"); - EXPECT_EQ(node.input(1), "filter"); - - EXPECT_EQ(node.attr().at("num_args").i(), 2); - EXPECT_EQ(node.input(2), "bias"); - - const auto fused_ops = node.attr().at("fused_ops").list().s(); - ASSERT_EQ(fused_ops.size(), 3); - EXPECT_EQ("BiasAdd", fused_ops[0]); - EXPECT_EQ("Add", fused_ops[1]); - EXPECT_EQ(activation, fused_ops[2]); - found++; - } - } - EXPECT_EQ(found, 1); - - auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed); - ASSERT_EQ(tensors_expected.size(), 1); - auto tensors = EvaluateNodes(output, item.fetch, item.feed); - ASSERT_EQ(tensors.size(), 1); - test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); - } -} - // Conv2D + Add {6,} + Conv2D + Biasadd fusion. TEST_F(RemapperTest, FuseConv2DWithSemanticAdd) { if (!IsMKLEnabled()) GTEST_SKIP() << "Test only applicable to MKL."; diff --git a/tensorflow/core/ir/dialect.td b/tensorflow/core/ir/dialect.td index d80ecfa70cc19a..f3530af1232a16 100644 --- a/tensorflow/core/ir/dialect.td +++ b/tensorflow/core/ir/dialect.td @@ -172,7 +172,6 @@ def TFGraphDialect : Dialect { let useDefaultAttributePrinterParser = 1; let hasNonDefaultDestructor = 1; let hasOperationInterfaceFallback = 1; - let usePropertiesForAttributes = 0; } #endif // TFG_DIALECT diff --git a/tensorflow/core/ir/types/dialect.h b/tensorflow/core/ir/types/dialect.h index 419d396cb7aaf0..e2dc8bef70a5d1 100644 --- a/tensorflow/core/ir/types/dialect.h +++ b/tensorflow/core/ir/types/dialect.h @@ -111,12 +111,13 @@ class TensorFlowRefType : public TensorFlowType { // Define a class for each individual TensorFlow type (dtype), see types.def // for the list. 
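+// The `name` constant added below stringizes the macro's third argument, so
+// each generated type class carries a compile-time identifier, mirroring the
+// explicit `name` members added to ResourceType and VariantType further down.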
-#define HANDLE_TF_TYPE(tftype, enumerant, name) \ +#define HANDLE_TF_TYPE(tftype, enumerant, name_marg) \ class tftype##Type : public detail::TensorFlowTypeImpl { \ public: \ using TFBase::TFBase; \ + static constexpr StringLiteral name = #name_marg; \ }; -#define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name) +#define HANDLE_CUSTOM_TF_TYPE(tftype, enumerant, name_marg) #include "tensorflow/core/ir/types/types.def" namespace detail { @@ -222,6 +223,7 @@ inline Type GetDefaultTypeOf(TensorFlowTypeWithSubtype type) { class ResourceType : public detail::TypeWithSubtypeImpl { public: using TFBase::TFBase; + static constexpr ::mlir::StringLiteral name = "tf_type.resource"; static std::string getTypeName() { return "ResourceType"; } }; @@ -233,6 +235,7 @@ class ResourceType : public detail::TypeWithSubtypeImpl { class VariantType : public detail::TypeWithSubtypeImpl { public: using TFBase::TFBase; + static constexpr ::mlir::StringLiteral name = "tf_type.variant"; static std::string getTypeName() { return "VariantType"; } }; diff --git a/tensorflow/core/ir/types/dialect.td b/tensorflow/core/ir/types/dialect.td index e6a13969b6843b..417b870977782a 100644 --- a/tensorflow/core/ir/types/dialect.td +++ b/tensorflow/core/ir/types/dialect.td @@ -47,7 +47,6 @@ def TFTypeDialect : Dialect { void printType(::mlir::Type type, ::mlir::DialectAsmPrinter &printer) const; }]; let useDefaultAttributePrinterParser = 1; - let usePropertiesForAttributes = 0; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 504620a451c331..025a63d92354c2 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -52,10 +52,8 @@ package( default_visibility = ["//visibility:public"], features = if_google( [ - "-layering_check", "-parse_headers", ], - ["-layering_check"], ), licenses = ["notice"], ) @@ -183,6 +181,7 @@ tf_kernel_library( "collective_nccl_reducer.h", "collective_nccl_reducer.cc", ]), + features = ["-layering_check"], prefix = "collective_ops", deps = [ "//tensorflow/core:core_cpu", @@ -202,6 +201,7 @@ tf_cuda_cc_test( name = "collective_nccl_test", size = "small", srcs = ["collective_nccl_test.cc"], + features = if_cuda(["-layering_check"]), tags = tf_cuda_tests_tags() + [ "guitar", "multi_gpu", @@ -265,6 +265,7 @@ cc_library( tf_kernel_library( name = "conv_2d", hdrs = ["conv_2d.h"], + features = if_cuda(["-layering_check"]), gpu_copts = if_not_windows([ "-Wno-pass-failed", # clang misses #pragma loop optimizations ]), @@ -317,6 +318,7 @@ cc_library( tf_kernel_library( name = "fill_functor", + features = ["-layering_check"], prefix = "fill_functor", deps = [ "//tensorflow/core:framework", @@ -372,6 +374,7 @@ cc_library( "sparse_utils.cc", ], hdrs = ["sparse_utils.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", "//tensorflow/core:framework_lite", @@ -382,6 +385,7 @@ cc_library( tf_cc_test( name = "sparse_utils_test", srcs = ["sparse_utils_test.cc"], + features = ["-layering_check"], deps = [ ":sparse_utils", "//tensorflow/core:framework", @@ -490,6 +494,7 @@ cc_library( tf_cuda_only_cc_test( name = "gpu_prim_helpers_test", srcs = ["gpu_prim_helpers_test.cu.cc"], + features = if_cuda(["-layering_check"]), tags = ["no_cuda_asan"], # TODO(b/183963619) deps = [ ":gpu_prim_helpers", @@ -538,6 +543,7 @@ tf_cuda_library( name = "gpu_utils", srcs = if_cuda_or_rocm(["gpu_utils.cc"]), hdrs = ["gpu_utils.h"], + features = ["-layering_check"], deps = [ 
":gpu_util_hdrs", "//tensorflow/core:lib", @@ -626,6 +632,7 @@ cc_library( name = "queue_base", srcs = ["queue_base.cc"], hdrs = ["queue_base.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -637,6 +644,7 @@ cc_library( name = "queue_op", srcs = ["queue_op.cc"], hdrs = ["queue_op.h"], + features = ["-layering_check"], deps = [ ":queue_base", "//tensorflow/core:framework", @@ -877,6 +885,7 @@ tf_kernel_library( tf_kernel_library( name = "debug_ops", + features = ["-layering_check"], prefix = "debug_ops", deps = ARRAY_DEPS + [ "//tensorflow/core:gpu_runtime", @@ -948,6 +957,7 @@ tf_kernel_library( tf_kernel_library( name = "concat_op", + features = ["-layering_check"], prefix = "concat_op", deps = ARRAY_DEPS, ) @@ -1072,6 +1082,7 @@ tf_kernel_library( tf_kernel_library( name = "reshape_op", + features = ["-layering_check"], prefix = "reshape_op", deps = ARRAY_DEPS, ) @@ -1090,6 +1101,7 @@ tf_kernel_library( tf_kernel_library( name = "shape_ops", + features = ["-layering_check"], prefix = "shape_ops", deps = ARRAY_DEPS + ["//tensorflow/core/common_runtime:dma_helper"], ) @@ -1108,6 +1120,7 @@ tf_kernel_library( tf_kernel_library( name = "split_op", + features = ["-layering_check"], gpu_srcs = ["gpu_device_array.h"], prefix = "split_op", deps = ARRAY_DEPS + [ @@ -1118,6 +1131,7 @@ tf_kernel_library( tf_kernel_library( name = "split_v_op", + features = ["-layering_check"], gpu_srcs = ["gpu_device_array.h"], prefix = "split_v_op", deps = ARRAY_DEPS + [ @@ -1190,6 +1204,7 @@ tf_kernel_library( tf_kernel_library( name = "unique_op", + features = if_cuda(["-layering_check"]), prefix = "unique_op", deps = ARRAY_DEPS + [ "@com_google_absl//absl/container:flat_hash_map", @@ -1219,6 +1234,7 @@ tf_kernel_library( name = "where_op", srcs = ["where_op.cc"], hdrs = ["where_op.h"], + features = ["-layering_check"], gpu_srcs = [ "where_op.h", "where_op_gpu.cu.h", @@ -1290,6 +1306,17 @@ cc_library( ], ) +cc_library( + name = "ragged_utils", + hdrs = [ + "ragged_utils.h", + ], + deps = [ + "//tensorflow/core:framework", + "@com_google_absl//absl/status", + ], +) + tf_kernel_library( name = "ragged_gather_op", srcs = ["ragged_gather_op.cc"], @@ -1315,6 +1342,7 @@ tf_cc_test( tf_kernel_library( name = "ragged_range_op", srcs = ["ragged_range_op.cc"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", ], @@ -1323,6 +1351,7 @@ tf_kernel_library( tf_cc_test( name = "ragged_range_op_test", srcs = ["ragged_range_op_test.cc"], + features = ["-layering_check"], deps = [ ":ops_testutil", ":ragged_range_op", @@ -1336,6 +1365,7 @@ tf_cc_test( tf_kernel_library( name = "ragged_tensor_to_sparse_kernel", srcs = ["ragged_tensor_to_sparse_kernel.cc"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", ], @@ -1389,6 +1419,7 @@ cc_library( name = "ragged_tensor_variant", srcs = ["ragged_tensor_variant.cc"], hdrs = ["ragged_tensor_variant.h"], + features = ["-layering_check"], deps = [ ":cwise_op", "//tensorflow/core:framework", @@ -1401,6 +1432,7 @@ tf_kernel_library( deps = [ ":concat_lib", ":ragged_tensor_variant", + ":ragged_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", ], @@ -1423,6 +1455,7 @@ tf_cc_test( "ragged_tensor_to_variant_op_test.cc", "ragged_tensor_to_variant_op_test.h", ], + features = ["-layering_check"], deps = [ ":ops_testutil", ":ragged_tensor_to_variant_op", @@ -1444,6 +1477,7 @@ tf_cc_test( "ragged_tensor_to_variant_op_large_data_test.cc", "ragged_tensor_to_variant_op_test.h", ], + 
features = ["-layering_check"], tags = [ "local", "manual", @@ -1482,7 +1516,9 @@ tf_cc_test( tf_kernel_library( name = "ragged_cross_op", srcs = ["ragged_cross_op.cc"], + features = ["-layering_check"], deps = [ + ":ragged_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", ], @@ -1508,6 +1544,7 @@ tf_cc_test( name = "ragged_fill_empty_rows_op_test", size = "small", srcs = ["ragged_fill_empty_rows_op_test.cc"], + features = ["-layering_check"], deps = [ ":ops_testutil", ":ragged_fill_empty_rows_op", @@ -1570,10 +1607,10 @@ cc_library( testonly = 1, srcs = ["batch_kernel_test_util.cc"], hdrs = ["batch_kernel_test_util.h"], + features = ["-layering_check"], deps = [ ":batch_kernels", ":ops_testutil", - ":ops_util", "//tensorflow/core:test", "//tensorflow/core:testlib", ], @@ -1583,6 +1620,7 @@ tf_cc_test( name = "batch_kernels_test", size = "small", srcs = ["batch_kernels_test.cc"], + features = ["-layering_check"], deps = [ ":batch_kernel_test_util", ":batch_kernels", @@ -1606,8 +1644,10 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/platform:status", "//tensorflow/core/platform:status_matchers", "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "@com_google_googletest//:gtest", ], ) @@ -1910,6 +1950,7 @@ tf_cuda_cc_test( name = "fused_batch_norm_ex_op_test", size = "small", srcs = ["fused_batch_norm_ex_op_test.cc"], + features = if_cuda(["-layering_check"]), tags = ["no_cuda_on_cpu_tap"], deps = [ ":cwise_op", @@ -1957,6 +1998,7 @@ tf_cc_test( tf_kernel_library( name = "gather_functor", + features = ["-layering_check"], prefix = "gather_functor", visibility = [":friends"], deps = [ @@ -2357,6 +2399,7 @@ tf_kernel_library( tf_cc_test( name = "while_op_test", srcs = ["while_op_test.cc"], + features = ["-layering_check"], tags = [ "no_windows", ], # TODO(b/208697533): Re-enable after fixing. 
@@ -2411,8 +2454,8 @@ tf_cc_test( ":control_flow_ops", ":ops_testutil", ":ops_util", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", - "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", @@ -2611,6 +2654,7 @@ DYNAMIC_DEPS = [ tf_kernel_library( name = "dynamic_partition_op", + features = if_cuda(["-layering_check"]), prefix = "dynamic_partition_op", deps = DYNAMIC_DEPS + [ ":fill_functor", @@ -2634,6 +2678,7 @@ cc_library( name = "tensor_cord", srcs = ["tensor_cord.cc"], hdrs = ["tensor_cord.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", "@com_google_absl//absl/strings", @@ -2806,6 +2851,7 @@ tf_kernel_library( name = "tensor_array", srcs = ["tensor_array.cc"], hdrs = ["tensor_array.h"], + features = ["-layering_check"], visibility = ["//visibility:private"], deps = [ ":aggregate_ops", @@ -2819,6 +2865,7 @@ tf_kernel_library( name = "resource_variable_ops", srcs = ["resource_variable_ops.cc"], hdrs = ["resource_variable_ops.h"], + features = ["-layering_check"], deps = [ ":dense_update_functor", ":gather_functor", @@ -2874,6 +2921,7 @@ tf_kernel_library( name = "list_kernels", srcs = ["list_kernels.cc"], hdrs = ["list_kernels.h"], + features = ["-layering_check"], gpu_srcs = [ "list_kernels.cu.cc", "list_kernels.h", @@ -2894,6 +2942,7 @@ cc_library( name = "tensor_map", srcs = ["tensor_map.cc"], hdrs = ["tensor_map.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -2947,6 +2996,7 @@ tf_kernel_library( tf_kernel_library( name = "function_ops", + features = ["-layering_check"], prefix = "function_ops", deps = [ "//tensorflow/core:core_cpu", @@ -3024,6 +3074,7 @@ tf_cc_test( ":no_mkldnn_contraction_kernel": [], "//conditions:default": ["eigen_mkldnn_contraction_kernel_test.cc"], }), + features = ["-layering_check"], tags = ["mkldnn_contraction_kernel"], deps = [ "//tensorflow/core:test", @@ -3167,6 +3218,7 @@ tf_kernel_library( hdrs = [ "checkpoint_callback_manager.h", ], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", "//tensorflow/core/platform:regexp", @@ -3178,6 +3230,7 @@ tf_cc_tests( name = "checkpoint_callback_manager_test", size = "small", srcs = ["checkpoint_callback_manager_test.cc"], + features = ["-layering_check"], deps = [ ":checkpoint_callback_manager", ":io", @@ -3368,6 +3421,7 @@ tf_cc_test( name = "resource_ops_test", size = "small", srcs = ["resource_ops_test.cc"], + features = ["-layering_check"], deps = [ ":dense_update_functor", ":ops_testutil", @@ -3382,6 +3436,7 @@ tf_cc_test( name = "lookup_ops_test", size = "small", srcs = ["lookup_ops_test.cc"], + features = ["-layering_check"], deps = [ ":lookup_table_op", ":ops_testutil", @@ -3465,6 +3520,7 @@ tf_kernel_library( tf_kernel_library( name = "matmul_op", + features = ["-layering_check"], prefix = "matmul_op", textual_hdrs = ["matmul_op_impl.h"], deps = MATH_DEPS + [ @@ -3495,6 +3551,7 @@ cc_library( name = "matmul_util", srcs = ["matmul_util.cc"], hdrs = ["matmul_util.h"], + features = ["-layering_check"], local_defines = if_cuda(["GOOGLE_CUDA=1"]) + if_rocm(["TENSORFLOW_USE_ROCM=1"]), deps = if_cuda_or_rocm([ "@com_google_absl//absl/container:flat_hash_map", @@ -3520,6 +3577,7 @@ tf_kernel_library( tf_kernel_library( name = "bucketize_op", + features = if_cuda(["-layering_check"]), gpu_srcs = ["gpu_device_array.h"], prefix = "bucketize_op", deps = ARRAY_DEPS, @@ -3530,6 +3588,7 @@ tf_kernel_library( 
copts = if_mlir_generated_gpu_kernels_enabled( ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"], ), + features = ["-layering_check"], # *.cu.cc sources are compiled with gpu_copts instead of copts. gpu_copts = if_mlir_generated_gpu_kernels_enabled( ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"], @@ -3542,6 +3601,7 @@ tf_kernel_library( tf_kernel_library( name = "check_numerics_op", + features = ["-layering_check"], prefix = "check_numerics_op", deps = MATH_DEPS + ["//tensorflow/core:framework_internal"], ) @@ -3576,6 +3636,7 @@ tf_kernel_library( copts = if_mlir_generated_gpu_kernels_enabled( ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"], ), + features = if_cuda(["-layering_check"]), # *.cu.cc sources are compiled with gpu_copts instead of copts. gpu_copts = if_mlir_generated_gpu_kernels_enabled( ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"], @@ -3595,6 +3656,7 @@ tf_kernel_library( tf_kernel_library( name = "fft_ops", + features = ["-layering_check"], prefix = "fft_ops", deps = MATH_DEPS + if_cuda([ "@com_google_absl//absl/container:flat_hash_map", @@ -3605,6 +3667,7 @@ tf_kernel_library( tf_kernel_library( name = "reduction_ops", + features = if_cuda(["-layering_check"]), gpu_srcs = ["reduction_gpu_kernels.cu.h"], prefix = "reduction_ops", deps = MATH_DEPS + [ @@ -3615,6 +3678,7 @@ tf_kernel_library( tf_kernel_library( name = "segment_reduction_ops", + features = ["-layering_check"], prefix = "segment_reduction_ops", deps = MATH_DEPS + [ "//tensorflow/core/util:determinism_for_kernels", @@ -3631,6 +3695,7 @@ tf_kernel_library( name = "scan_ops", srcs = ["scan_ops.cc"], hdrs = ["scan_ops.h"], + features = if_cuda(["-layering_check"]), gpu_srcs = [ "scan_ops.h", "scan_ops_gpu.h", @@ -3999,6 +4064,7 @@ tf_kernel_library( defines = [ "EIGEN_NEON_GEBP_NR=4", ], + features = ["-layering_check"], prefix = "conv_ops", textual_hdrs = [ "autotune_conv_impl.h", @@ -4067,6 +4133,7 @@ tf_kernel_library( name = "depthwise_conv_op", srcs = ["depthwise_conv_op.cc"], hdrs = ["depthwise_conv_op.h"], + features = ["-layering_check"], gpu_copts = if_not_windows([ "-Wno-pass-failed", # clang misses #pragma loop optimizations ]), @@ -4102,6 +4169,7 @@ tf_kernel_library( hdrs = [ "depthwise_conv_op.h", ], + features = ["-layering_check"], prefix = "depthwise_conv_grad_op", deps = [ ":cast_op", @@ -4174,6 +4242,7 @@ tf_kernel_library( tf_kernel_library( name = "bias_op", + features = ["-layering_check"], prefix = "bias_op", deps = NN_DEPS + [ ":loose_headers", @@ -4194,6 +4263,7 @@ tf_kernel_library( tf_kernel_library( name = "fused_batch_norm_op", + features = ["-layering_check"], prefix = "fused_batch_norm_op", deps = NN_DEPS + [ ":cast_op", @@ -4208,12 +4278,14 @@ tf_kernel_library( tf_kernel_library( name = "in_topk_op", + features = if_cuda(["-layering_check"]), prefix = "in_topk_op", deps = NN_DEPS + [":reduction_ops"], ) tf_kernel_library( name = "lrn_op", + features = ["-layering_check"], prefix = "lrn_op", deps = NN_DEPS + if_rocm([":conv_ops_gpu_hdrs"]) + [":loose_headers"], ) @@ -4225,6 +4297,7 @@ tf_kernel_library( ) + if_mlir_generated_gpu_kernels_enabled( ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"], ), + features = if_cuda(["-layering_check"]), # *.cu.cc sources are compiled with gpu_copts instead of copts. 
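+    # if_cuda(["-layering_check"]) disables the check only when building with
+    # CUDA support; CPU-only builds keep strict layering for this target.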
gpu_copts = if_mlir_generated_experimental_kernels_enabled( ["-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED"], @@ -4345,6 +4418,7 @@ tf_kernel_library( tf_kernel_library( name = "l2loss_op", + features = if_cuda(["-layering_check"]), prefix = "l2loss_op", deps = [ ":gpu_prim_hdrs", @@ -4440,6 +4514,7 @@ tf_kernel_library( "pooling_ops_3d.h", "pooling_ops_common.h", ], + features = ["-layering_check"], gpu_srcs = [ "avgpooling_op.h", "avgpooling_op_gpu.cu.cc", @@ -4686,6 +4761,7 @@ cc_library( tf_kernel_library( name = "random_op", + features = ["-layering_check"], prefix = "random_op", deps = RANDOM_OPS_DEPS, ) @@ -4771,6 +4847,7 @@ cc_library( tf_kernel_library( name = "stateful_random_ops", + features = if_cuda(["-layering_check"]), prefix = "stateful_random_ops", deps = [ ":dense_update_functor", @@ -4797,6 +4874,7 @@ tf_kernel_library( tf_kernel_library( name = "stateless_random_gamma_op", + features = ["-layering_check"], prefix = "stateless_random_gamma_op", deps = [ ":stateless_random_ops", @@ -4840,6 +4918,7 @@ tf_cc_test( tf_kernel_library( name = "random_index_shuffle_ops", + features = ["-layering_check"], prefix = "random_index_shuffle_ops", deps = [ ":random_index_shuffle", @@ -4940,6 +5019,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_concat_op", + features = ["-layering_check"], prefix = "sparse_concat_op", deps = SPARSE_DEPS + if_cuda_or_rocm([ ":gpu_device_array", @@ -4964,6 +5044,7 @@ tf_kernel_library( tf_kernel_library( name = "fill_empty_rows_functor", + features = if_cuda(["-layering_check"]), prefix = "fill_empty_rows_functor", deps = [ "//tensorflow/core:framework", @@ -4979,6 +5060,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_cross_op", + features = ["-layering_check"], prefix = "sparse_cross_op", deps = SPARSE_DEPS + [ "@eigen_archive//:eigen3", @@ -5011,6 +5093,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_reorder_op", + features = if_cuda(["-layering_check"]), prefix = "sparse_reorder_op", deps = SPARSE_DEPS + if_cuda_or_rocm([ ":gpu_prim_hdrs", @@ -5028,6 +5111,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_slice_grad_op", + features = if_cuda(["-layering_check"]), prefix = "sparse_slice_grad_op", deps = SPARSE_DEPS + if_cuda_or_rocm([ ":gpu_prim_hdrs", @@ -5036,6 +5120,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_slice_op", + features = if_cuda(["-layering_check"]), prefix = "sparse_slice_op", deps = SPARSE_DEPS + if_cuda_or_rocm([ ":gpu_device_array", @@ -5057,6 +5142,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_split_op", + features = if_cuda(["-layering_check"]), prefix = "sparse_split_op", deps = SPARSE_DEPS + if_cuda_or_rocm([ ":gpu_device_array", @@ -5090,6 +5176,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_to_dense_op", + features = ["-layering_check"], prefix = "sparse_to_dense_op", deps = SPARSE_DEPS + [ ":loose_headers", @@ -5102,6 +5189,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_xent_op", + features = if_cuda(["-layering_check"]), gpu_copts = tf_disable_ptxas_warning_flags(), prefix = "sparse_xent_op", deps = SPARSE_DEPS + [ @@ -5147,6 +5235,7 @@ tf_kernel_library( tf_kernel_library( name = "sparse_tensors_map_ops", + features = ["-layering_check"], prefix = "sparse_tensors_map_ops", deps = SPARSE_DEPS, ) @@ -5323,6 +5412,7 @@ cc_library( name = "scatter_nd_util", srcs = ["scatter_nd_util.cc"], hdrs = ["scatter_nd_util.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", ], @@ -5344,6 +5434,7 @@ tf_kernel_library( hdrs = [ 
"scatter_nd_op.h", ], + features = ["-layering_check"], gpu_copts = if_not_windows([ "-Wno-pass-failed", # clang misses #pragma loop optimizations ]), @@ -5365,6 +5456,7 @@ tf_kernel_library( tf_kernel_library( name = "variable_ops", + features = ["-layering_check"], prefix = "variable_ops", deps = STATE_DEPS, ) @@ -5474,7 +5566,7 @@ tf_kernel_library( name = "tensor_to_hash_bucket_op", prefix = "tensor_to_hash_bucket_op", deps = STRING_DEPS + if_oss( - if_cuda(["@farmhash_gpu_archive//:farmhash_gpu"]), + if_cuda_or_rocm(["@farmhash_gpu_archive//:farmhash_gpu"]), tf_fingerprint_deps(), ), ) @@ -5656,6 +5748,7 @@ tf_cc_test( tf_kernel_library( name = "as_string_op", + features = ["-layering_check"], prefix = "as_string_op", deps = STRING_DEPS, ) @@ -5730,6 +5823,7 @@ tf_cc_test( tf_kernel_library( name = "multinomial_op", + features = if_cuda(["-layering_check"]), prefix = "multinomial_op", deps = [ ":gpu_prim_hdrs", @@ -5763,6 +5857,7 @@ tf_cuda_cc_test( tf_kernel_library( name = "parameterized_truncated_normal_op", + features = if_cuda(["-layering_check"]), gpu_copts = if_not_windows([ "-Wno-pass-failed", # clang misses #pragma loop optimizations ]), @@ -6011,6 +6106,7 @@ tf_cuda_cc_test( name = "spectrogram_op_test", size = "small", srcs = ["spectrogram_op_test.cc"], + features = ["-layering_check"], deps = [ ":ops_util", ":spectrogram_op", @@ -6381,6 +6477,7 @@ filegroup( "fill_empty_rows_functor.h", "function_ops.h", "fused_batch_norm_op.h", + "gpu_utils.h", "inplace_ops.cc", "inplace_ops_functor.h", "l2loss_op.h", @@ -6397,6 +6494,7 @@ filegroup( "partitioned_function_ops.h", "pooling_ops_3d.h", "ragged_tensor_variant.h", + "ragged_utils.h", "random_index_shuffle.h", "random_op.h", "random_poisson_op.h", @@ -6978,6 +7076,7 @@ cc_library( ]), copts = tf_copts() + tf_opts_nortti_if_lite_protos(), defines = ["EIGEN_NEON_GEBP_NR=4"], + features = ["-layering_check"], linkopts = if_android(["-ldl"]), tags = [ "manual", @@ -7071,6 +7170,7 @@ tf_kernel_library( "reshape_op.h", ], hdrs = ["reference_gemm.h"], + features = ["-layering_check"], deps = [ ":concat_lib_hdrs", ":conv_ops", @@ -7163,6 +7263,7 @@ tf_cc_binary( testonly = 1, srcs = ["quantization_utils_test.cc"], copts = tf_copts(), + features = ["-layering_check"], linkopts = select({ "//tensorflow:android": [ "-lm", @@ -7222,6 +7323,7 @@ cc_binary( testonly = 1, srcs = ["quantized_add_op_test.cc"], copts = tf_copts(), + features = ["-layering_check"], linkopts = select({ "//tensorflow:android": [ "-lm", @@ -7307,6 +7409,7 @@ cc_binary( testonly = 1, srcs = ["quantized_resize_bilinear_op_test.cc"], copts = tf_copts(), + features = ["-layering_check"], linkopts = select({ "//tensorflow:android": [ "-lm", @@ -7428,6 +7531,7 @@ cc_binary( name = "quantized_mul_op_test_android_only", testonly = 1, srcs = ["quantized_mul_op_test.cc"], + features = ["-layering_check"], linkopts = select({ "//tensorflow:android": [ "-pie", @@ -7622,6 +7726,7 @@ cc_library( name = "quantization_utils", srcs = ["quantization_utils.cc"], hdrs = ["quantization_utils.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core:framework", "@gemmlowp", @@ -7734,6 +7839,7 @@ tf_kernel_library( tf_kernel_library( name = "sync_ops", + features = ["-layering_check"], prefix = "sync_ops", deps = [ "//tensorflow/core:framework", @@ -7816,6 +7922,7 @@ cc_library( tf_kernel_library( name = "stochastic_cast_op", + features = ["-layering_check"], prefix = "stochastic_cast_op", deps = [ ":stateless_random_ops_v2_util", @@ -7829,6 +7936,7 @@ tf_cc_test( name = 
"stochastic_cast_op_test", timeout = "moderate", srcs = ["stochastic_cast_op_test.cc"], + features = ["-layering_check"], shard_count = 48, deps = [ ":cwise_lib", diff --git a/tensorflow/core/kernels/batch_kernel_test_util.cc b/tensorflow/core/kernels/batch_kernel_test_util.cc index e7d35ec2e4779c..bda3c25b182973 100644 --- a/tensorflow/core/kernels/batch_kernel_test_util.cc +++ b/tensorflow/core/kernels/batch_kernel_test_util.cc @@ -15,51 +15,53 @@ limitations under the License. #include "tensorflow/core/kernels/batch_kernel_test_util.h" +#include + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/kernels/batch_kernels.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/status.h" + namespace tensorflow { -namespace internal { +namespace test_util { BatchFunctionKernelTestAccess::BatchFunctionKernelTestAccess( - BatchFunctionKernel* kernel) + const BatchFunctionKernel* kernel) : kernel_(kernel) {} bool BatchFunctionKernelTestAccess::enable_adaptive_batch_threads() const { return kernel_->enable_adaptive_batch_threads_; } -} // namespace internal - -bool BatchFunctionKernelTestBase::enable_adaptive_scheduler() const { - return GetParam(); -} - -Status BatchFunctionKernelTestBase::Init() { +Status BatchFunctionKernelTestBase::Init(bool enable_adaptive_scheduler) { std::vector input_dtypes({DataType::DT_INT64, DataType::DT_INT64}); std::vector inputs( {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64}), NodeDefBuilder::NodeOut({"n2", 1, DataType::DT_INT64})}); NameAttrList f; f.set_name("func_to_batch"); - TF_CHECK_OK( - NodeDefBuilder("BatchTPUInput", "BatchFunction") - .Attr("max_batch_size", 32) - .Attr("num_batch_threads", enable_adaptive_scheduler() ? 0 : 8) - .Attr("allowed_batch_sizes", {2, 4, 8}) - .Attr("batch_timeout_micros", 1000) - .Attr("max_enqueued_batches", 100) - .Attr("enable_large_batch_splitting", true) - .Attr("low_priority_max_batch_size", 64) - .Attr("low_priority_batch_timeout_micros", 8000) - .Attr("low_priority_allowed_batch_sizes", {32, 64}) - .Attr("low_priority_max_enqueued_batches", 1000) - .Attr("Tcaptured", std::vector{DataType::DT_INT64}) - .Attr("Tin", input_dtypes) - .Input(inputs) - .Attr("Tcaptured", std::vector{DataType::DT_INT64}) - .Input(std::vector{ - NodeDefBuilder::NodeOut({"n3", 1, DataType::DT_INT64})}) - .Attr("Tout", std::vector(4, DataType::DT_INT64)) - .Attr("f", f) - .Finalize(node_def())); + TF_CHECK_OK(NodeDefBuilder("BatchTPUInput", "BatchFunction") + .Attr("max_batch_size", 32) + .Attr("num_batch_threads", enable_adaptive_scheduler ? 
0 : 8) + .Attr("allowed_batch_sizes", {2, 4, 8}) + .Attr("batch_timeout_micros", 1000) + .Attr("max_enqueued_batches", 100) + .Attr("enable_large_batch_splitting", true) + .Attr("low_priority_max_batch_size", 64) + .Attr("low_priority_batch_timeout_micros", 8000) + .Attr("low_priority_allowed_batch_sizes", {32, 64}) + .Attr("low_priority_max_enqueued_batches", 1000) + .Attr("Tcaptured", std::vector{DataType::DT_INT64}) + .Attr("Tin", input_dtypes) + .Input(inputs) + .Attr("Tcaptured", std::vector{DataType::DT_INT64}) + .Input(std::vector{ + NodeDefBuilder::NodeOut({"n3", 1, DataType::DT_INT64})}) + .Attr("Tout", std::vector(4, DataType::DT_INT64)) + .Attr("f", f) + .Finalize(node_def())); return InitOp(); } + +} // namespace test_util } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_kernel_test_util.h b/tensorflow/core/kernels/batch_kernel_test_util.h index e26f6c5d78914c..e6b37e635ac0bc 100644 --- a/tensorflow/core/kernels/batch_kernel_test_util.h +++ b/tensorflow/core/kernels/batch_kernel_test_util.h @@ -16,37 +16,33 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_BATCH_KERNEL_TEST_UTIL_H_ #define TENSORFLOW_CORE_KERNELS_BATCH_KERNEL_TEST_UTIL_H_ -#include "tensorflow/core/framework/node_def_builder.h" +#include #include "tensorflow/core/kernels/batch_kernels.h" #include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/kernels/ops_util.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/status.h" namespace tensorflow { -namespace internal { +namespace test_util { + +// A test util for accessing private members of `BatchFunctionKernel`. class BatchFunctionKernelTestAccess { public: - explicit BatchFunctionKernelTestAccess(BatchFunctionKernel* kernel); + explicit BatchFunctionKernelTestAccess(const BatchFunctionKernel* kernel); bool enable_adaptive_batch_threads() const; private: - BatchFunctionKernel* const kernel_; + const BatchFunctionKernel* const kernel_; }; -} // namespace internal - class BatchFunctionKernelTestBase : public OpsTestBase, public ::testing::WithParamInterface { public: - bool enable_adaptive_scheduler() const; - // Init test fixture with a batch kernel instance. 
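+  // `enable_adaptive_scheduler` replaces the old GetParam()-based accessor:
+  // when true the node is built with num_batch_threads = 0, which switches
+  // the kernel to adaptive batch threads (what the tests assert on).
+  // Typical use from a parameterized test (see batch_kernels_test.cc):
+  //   const bool adaptive = GetParam();
+  //   TF_EXPECT_OK(Init(adaptive));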
- Status Init(); + Status Init(bool enable_adaptive_scheduler); }; +} // namespace test_util } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_BATCH_KERNEL_TEST_UTIL_H_ diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index 1763fcd3c15088..8862e5b0c98f58 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -218,7 +218,7 @@ class BatchResource : public serving::BatchResourceBase { has_process_batch_function, std::move(batcher), GetAdaptiveBatcherQueueOptions( max_batch_size, batch_timeout_micros, max_enqueued_batches, - true /* enable large batch split */, allowed_batch_sizes, + /*enable_large_batch_splitting=*/true, allowed_batch_sizes, /*disable_padding=*/false), allowed_batch_sizes)); return OkStatus(); @@ -302,9 +302,6 @@ BatchFunctionKernel::BatchFunctionKernel(OpKernelConstruction* c) OP_REQUIRES_OK(c, c->GetAttr("enable_large_batch_splitting", &enable_large_batch_splitting_)); has_attribute_enable_large_batch_splitting_ = true; - } else { - enable_large_batch_splitting_ = false; - has_attribute_enable_large_batch_splitting_ = false; } // Helper function `SetAdaptiveBatchSchedulerOptions` calls diff --git a/tensorflow/core/kernels/batch_kernels.h b/tensorflow/core/kernels/batch_kernels.h index 9ea1b195a050f6..1c9c35356e3d2f 100644 --- a/tensorflow/core/kernels/batch_kernels.h +++ b/tensorflow/core/kernels/batch_kernels.h @@ -34,9 +34,9 @@ ABSL_CONST_INIT extern const int64_t kInitialInflightBatches; ABSL_CONST_INIT extern const int64_t kBatchesToAverageOver; ABSL_CONST_INIT extern const int64_t kMaxInflightBatches; -namespace internal { +namespace test_util { class BatchFunctionKernelTestAccess; -} +} // namespace test_util // Records the usage of attribute `enable_large_batch_splitting`. void RecordBatchSplitUsage( @@ -71,7 +71,7 @@ class BatchFunctionKernel : public AsyncOpKernel { void ComputeAsync(OpKernelContext* c, DoneCallback done) final; private: - friend class internal::BatchFunctionKernelTestAccess; + friend class test_util::BatchFunctionKernelTestAccess; // Validates 'allowed_batch_sizes_'. The entries must increase monotonically. // If large batch split is not enabled, the last one must equal @@ -111,8 +111,8 @@ class BatchFunctionKernel : public AsyncOpKernel { std::vector low_priority_allowed_batch_sizes_; NameAttrList func_; absl::optional fhandle_ TF_GUARDED_BY(mu_); - bool enable_large_batch_splitting_; - bool has_attribute_enable_large_batch_splitting_; + bool enable_large_batch_splitting_ = false; + bool has_attribute_enable_large_batch_splitting_ = false; bool enable_adaptive_batch_threads_ = false; mutex mu_; diff --git a/tensorflow/core/kernels/batch_kernels_env_test.cc b/tensorflow/core/kernels/batch_kernels_env_test.cc index 8b2819c0a6be3f..508c0e8699763c 100644 --- a/tensorflow/core/kernels/batch_kernels_env_test.cc +++ b/tensorflow/core/kernels/batch_kernels_env_test.cc @@ -13,29 +13,37 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include "tensorflow/core/kernels/batch_kernel_test_util.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/status_matchers.h" -#include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tsl/lib/core/status_test_util.h" namespace tensorflow { +namespace { // Tests that batch kernel initialization returns error when it's configured to // use adaptive scheduling yet batching thread pool creation fails. -class BatchFunctionKernelEnvTest : public BatchFunctionKernelTestBase {}; +class BatchFunctionKernelEnvTest + : public test_util::BatchFunctionKernelTestBase {}; TEST_P(BatchFunctionKernelEnvTest, Basic) { tensorflow::setenv("TF_NUM_BATCH_THREADS", "0", 1 /* overwrite */); - if (enable_adaptive_scheduler()) { - EXPECT_THAT(Init(), tensorflow::testing::StatusIs( + + const bool adaptive_scheduler_enabled = GetParam(); + Status status = Init(adaptive_scheduler_enabled); + if (adaptive_scheduler_enabled) { + EXPECT_THAT(status, tensorflow::testing::StatusIs( error::FAILED_PRECONDITION, "Failed to create batch threads pool")); } else { // Initialization is ok since batch kernel doesn't use adaptive // scheduler. - TF_EXPECT_OK(Init()); + TF_EXPECT_OK(status); } } INSTANTIATE_TEST_SUITE_P(Params, BatchFunctionKernelEnvTest, ::testing::Bool()); + +} // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_kernels_test.cc b/tensorflow/core/kernels/batch_kernels_test.cc index 7b7810780872a7..af7546a062169d 100644 --- a/tensorflow/core/kernels/batch_kernels_test.cc +++ b/tensorflow/core/kernels/batch_kernels_test.cc @@ -24,30 +24,38 @@ limitations under the License. 
#include "absl/strings/match.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/batch_kernel_test_util.h" #include "tensorflow/core/kernels/batching_util/warmup.h" -#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/config.pb.h" +#include "tsl/lib/core/status_test_util.h" #include "tsl/platform/blocking_counter.h" +#include "tsl/platform/errors.h" #include "tsl/platform/status.h" namespace tensorflow { +namespace { using PerModelData = serving::WarmupStateRegistry::PerModelData; -class BatchFunctionKernelTest : public BatchFunctionKernelTestBase {}; +class BatchFunctionKernelTest : public test_util::BatchFunctionKernelTestBase { +}; TEST_P(BatchFunctionKernelTest, EnableAdaptiveScheduler) { - TF_EXPECT_OK(Init()); + const bool adaptive_scheduler_enabled = GetParam(); + + TF_EXPECT_OK(Init(adaptive_scheduler_enabled)); + BatchFunctionKernel *batch_kernel = dynamic_cast(op_kernel()); - EXPECT_EQ(internal::BatchFunctionKernelTestAccess(batch_kernel) - .enable_adaptive_batch_threads(), - enable_adaptive_scheduler()); + EXPECT_EQ(adaptive_scheduler_enabled, + test_util::BatchFunctionKernelTestAccess(batch_kernel) + .enable_adaptive_batch_threads()); } INSTANTIATE_TEST_SUITE_P(Params, BatchFunctionKernelTest, ::testing::Bool()); @@ -55,51 +63,68 @@ INSTANTIATE_TEST_SUITE_P(Params, BatchFunctionKernelTest, ::testing::Bool()); class BatchFunctionKernelParallelWarmupTestState : public OpsTestBase { public: // Init test fixture with a batch kernel instance. - Status Init(bool enable_splitting, bool check_output_shape = true) { + Status Init(bool enable_splitting, bool check_output_shape) { static auto *const cpu_device = []() { auto device = DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"); return device.release(); }(); - // Overriding the per-test/per-op device with a global device so that it can + // Override the per-test/per-op device with a global device so that it can // be shared between ops. 
device_ = cpu_device; - std::vector input_dtypes({DataType::DT_INT64}); - std::vector inputs( - {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64})}); - NameAttrList f; - f.set_name("func_to_batch"); - tensorflow::FunctionDefHelper::Node node_info = { - {"output1"}, "Identity", {"input1"}, {{"T", DT_INT64}}}; + f.set_name("BatchFunctionKernelParallelWarmupTestStateFunc"); + FunctionDef func; if (check_output_shape) { - node_info = {{"output1"}, - "EnsureShape", - {"input1"}, - {{"T", DT_INT64}, {"shape", TensorShape({2})}}}; + func = FunctionDefHelper::Create( + // function_name + f.name(), + // in_def + {"x:int64"}, + // out_def + {"o:int64"}, + // attr_def + {}, + // node_def + {{{"o"}, + "EnsureShape", + {"x"}, + {{"T", DataType::DT_INT64}, {"shape", TensorShape({2})}}}}, + // ret_def + {{"o", "o:output"}}); + } else { + func = FunctionDefHelper::Create( + // function_name + f.name(), + // in_def + {"x:int64"}, + // out_def + {"o:int64"}, + // attr_def + {}, + // node_def + {{{"o"}, "Identity", {"x"}, {{"T", DataType::DT_INT64}}}}, + // ret_def + {{"o", "o:output"}}); } - TF_RETURN_IF_ERROR(flib_def_->AddFunctionDef(FunctionDefHelper::Define( - /*Function*/ "func_to_batch", - /*Inputs*/ {"input1:int64"}, - /*Outputs*/ {"output1:int64"}, - /*Attribute*/ {}, - // Node info - {node_info}))); + TF_RETURN_IF_ERROR(flib_def_->AddFunctionDef(func)); pflr_ = std::make_unique( device_mgr_.get(), Env::Default(), /*config=*/nullptr, TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), /*thread_pool=*/nullptr, /*parent=*/nullptr, /*session_metadata=*/nullptr, - Rendezvous::Factory{[](const int64, const DeviceMgr *device_mgr, + Rendezvous::Factory{[](const int64_t, const DeviceMgr *device_mgr, tsl::core::RefCountPtr *r) { *r = tsl::core::RefCountPtr( new IntraProcessRendezvous(device_mgr)); return OkStatus(); }}); + std::vector inputs( + {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64})}); TF_CHECK_OK(NodeDefBuilder("BatchTPUInput", "BatchFunction") .Attr("max_batch_size", enable_splitting ? 16 : 8) .Attr("num_batch_threads", 8) @@ -111,7 +136,7 @@ class BatchFunctionKernelParallelWarmupTestState : public OpsTestBase { .Attr("low_priority_batch_timeout_micros", 8000) .Attr("low_priority_allowed_batch_sizes", {32, 64}) .Attr("low_priority_max_enqueued_batches", 1000) - .Attr("Tin", input_dtypes) + .Attr("Tin", {DataType::DT_INT64}) .Input(inputs) .Attr("Tcaptured", std::vector{}) .Input(std::vector{}) @@ -150,7 +175,8 @@ TEST_P(BatchFunctionKernelParallelWarmupTest, ParallelWarmup) { Env::Default()->SchedClosure([&]() { BatchFunctionKernelParallelWarmupTestState test; test.set_session_metadata(session_metadata); - TF_CHECK_OK(test.Init(enable_splitting)); + TF_CHECK_OK(test.Init(enable_splitting, + /*check_output_shape=*/true)); test.AddInputFromList(TensorShape({2}), {123, 456}); TF_CHECK_OK(test.RunOpKernel()); @@ -171,7 +197,8 @@ TEST_P(BatchFunctionKernelParallelWarmupTest, ParallelWarmup) { Env::Default()->SchedClosure([&]() { BatchFunctionKernelParallelWarmupTestState test; test.set_session_metadata(session_metadata); - TF_CHECK_OK(test.Init(enable_splitting)); + TF_CHECK_OK(test.Init(enable_splitting, + /*check_output_shape=*/true)); test.AddInputFromList(TensorShape({2}), {123, 456}); // We expect requests to be batched together when the warm-up mode is // turned off, which will make the execution fail at `EnsureShape`. 
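The warmup test above now builds its batched function with FunctionDefHelper::Create: with check_output_shape set, the body is an EnsureShape node pinned to shape {2}, so a request only succeeds while it runs unbatched during warmup and fails once requests are concatenated into a larger batch. As a quick recap of how Create()'s arguments map onto a function signature, here is a minimal sketch that builds a toy Identity function and dumps the proto; it assumes a TensorFlow build, and the ToyIdentity name is made up.

#include <iostream>
#include "tensorflow/core/framework/function.h"

int main() {
  tensorflow::FunctionDef func = tensorflow::FunctionDefHelper::Create(
      /*function_name=*/"ToyIdentity",
      /*in_def=*/{"x:int64"},    // one int64 input named x
      /*out_def=*/{"o:int64"},   // one int64 output named o
      /*attr_def=*/{},           // no function attributes
      /*node_def=*/
      {{{"o"}, "Identity", {"x"}, {{"T", tensorflow::DT_INT64}}}},
      /*ret_def=*/{{"o", "o:output"}});  // map output o to node o's output
  std::cout << func.DebugString() << std::endl;
  return 0;
}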
@@ -205,7 +232,7 @@ TEST_P(BatchFunctionKernelParallelWarmupTest, ParallelWarmupAutoBatch) { Env::Default()->SchedClosure([&]() { BatchFunctionKernelParallelWarmupTestState test; test.set_session_metadata(session_metadata); - TF_CHECK_OK(test.Init(enable_splitting)); + TF_CHECK_OK(test.Init(enable_splitting, /*check_output_shape=*/true)); test.AddInputFromList(TensorShape({2}), {123, 456}); auto status = test.RunOpKernel(); ASSERT_FALSE(status.ok()); @@ -250,5 +277,5 @@ TEST_P(BatchFunctionKernelParallelWarmupTest, ParallelWarmupAutoBatch) { INSTANTIATE_TEST_SUITE_P(BatchFunctionKernelParallelWarmupTestSuite, BatchFunctionKernelParallelWarmupTest, ::testing::Bool()); - +} // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc index 45ddc853295557..7b96122b521ae5 100644 --- a/tensorflow/core/kernels/batch_norm_op_test.cc +++ b/tensorflow/core/kernels/batch_norm_op_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include + #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -29,60 +30,50 @@ limitations under the License. namespace tensorflow { -class BatchNormOpTest : public OpsTestBase {}; - -TEST_F(BatchNormOpTest, Simple) { - TF_EXPECT_OK( - NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization") - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Attr("scale_after_normalization", false) - .Attr("variance_epsilon", 0.001) - .Finalize(node_def())); - TF_EXPECT_OK(InitOpWithGraphVersion(8)); - AddInputFromArray(TensorShape({1, 1, 6, 2}), - {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6}); - AddInputFromArray(TensorShape({2}), {10, 20}); - AddInputFromArray(TensorShape({2}), {0.25f, 0.5f}); - AddInputFromArray(TensorShape({2}), {0.1f, 0.6f}); - AddInputFromArray(TensorShape({2}), {0.0f, 0.0f}); - TF_ASSERT_OK(RunOpKernel()); - - Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2})); - test::FillValues( - &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f, - -33.31f, -23.85f, -34.72f, -25.85f, -36.13f}); - test::ExpectTensorNear(expected, *GetOutput(0), 0.01); -} - -TEST_F(BatchNormOpTest, Fp16) { - TF_EXPECT_OK( - NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization") - .Input(FakeInput(DT_HALF)) - .Input(FakeInput(DT_HALF)) - .Input(FakeInput(DT_HALF)) - .Input(FakeInput(DT_HALF)) - .Input(FakeInput(DT_HALF)) - .Attr("scale_after_normalization", false) - .Attr("variance_epsilon", 0.001) - .Finalize(node_def())); - TF_EXPECT_OK(InitOpWithGraphVersion(8)); - AddInputFromList(TensorShape({1, 1, 6, 2}), - {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6}); - AddInputFromList(TensorShape({2}), {10, 20}); - AddInputFromList(TensorShape({2}), {0.25, 0.5}); - AddInputFromList(TensorShape({2}), {0.1, 0.6}); - AddInputFromList(TensorShape({2}), {0.0, 0.0}); - TF_ASSERT_OK(RunOpKernel()); - - Tensor expected(allocator(), DT_HALF, TensorShape({1, 1, 6, 2})); - test::FillValues( - &expected, {-17.86, -22.00, -15.87, -20.59, -13.87, -19.18, -21.86, - -33.31, -23.85, -34.72, -25.85, -36.13}); - test::ExpectTensorNear(expected, *GetOutput(0), 0.1); -} +template +struct BatchNormOpTest : public OpsTestBase { + static constexpr auto TValueType = DataTypeToEnum::value; + + void 
run_me() { + TF_EXPECT_OK( + NodeDefBuilder("batch_norm_op", "BatchNormWithGlobalNormalization") + .Input(FakeInput(TValueType)) + .Input(FakeInput(TValueType)) + .Input(FakeInput(TValueType)) + .Input(FakeInput(TValueType)) + .Input(FakeInput(TValueType)) + .Attr("scale_after_normalization", false) + .Attr("variance_epsilon", 0.001) + .Finalize(node_def())); + TF_EXPECT_OK(InitOpWithGraphVersion(8)); + + AddInputFromList(TensorShape({1, 1, 6, 2}), + {1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6}); + AddInputFromList(TensorShape({2}), {10, 20}); + AddInputFromList(TensorShape({2}), {0.25, 0.5}); + AddInputFromList(TensorShape({2}), {0.1, 0.6}); + AddInputFromList(TensorShape({2}), {0.0, 0.0}); + + TF_ASSERT_OK(RunOpKernel()); + + double atol = TValueType == DT_FLOAT ? 0.01 : 0.1; + + Tensor expected(allocator(), TValueType, TensorShape({1, 1, 6, 2})); + test::FillValues(&expected, + {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, + -21.86f, -33.31f, -23.85f, -34.72f, -25.85f, -36.13f}); + test::ExpectTensorNear(expected, *GetOutput(0), atol); + } +}; + +TYPED_TEST_SUITE_P(BatchNormOpTest); + +TYPED_TEST_P(BatchNormOpTest, Simple) { this->run_me(); } + +REGISTER_TYPED_TEST_SUITE_P(BatchNormOpTest, Simple); + +// TODO(ezhulenev): Add support for more data types. +using DataTypes = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(Test, BatchNormOpTest, DataTypes); } // namespace tensorflow diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.cc b/tensorflow/core/kernels/batching_util/batch_resource_base.cc index c1395ed464252c..fa900a9c87789c 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base.cc +++ b/tensorflow/core/kernels/batching_util/batch_resource_base.cc @@ -234,6 +234,23 @@ void RecordBatchParamAllowedBatchSizes(const string& allowed_batch_sizes, cell->GetCell(model_name, op_name)->Set(allowed_batch_sizes); } +void RecordBatchCosts(const std::string& model_name, + const int64_t processed_size, + const absl::string_view cost_type, + const absl::Duration total_cost) { + static auto* cell = tensorflow::monitoring::Sampler<3>::New( + {"/tensorflow/serving/batching/costs", + "Tracks the batch costs (in microseconds) by model name and processed " + "size.", + "model_name", "processed_size", "cost_type"}, + // It's 27 buckets with the last bucket being 2^26 to DBL_MAX; + // so the limits are [1, 2, 4, 8, ..., 64 * 1024 * 1024 (~64s), DBL_MAX]. + monitoring::Buckets::Exponential(1, 2, 27)); + cell->GetCell(model_name, std::to_string(processed_size), + std::string(cost_type)) + ->Add(absl::ToDoubleMicroseconds(total_cost)); +} + const string& GetModelName(OpKernelContext* ctx) { static string* kModelNameUnset = new string("model_name_unset"); if (!ctx->session_metadata()) return *kModelNameUnset; @@ -485,6 +502,7 @@ BatchResourceBase::GetBatcherQueueOptions( *allowed_batch_sizes.rbegin(); batcher_queue_options.high_priority_queue_options .max_execution_batch_size = *allowed_batch_sizes.rbegin(); + batcher_queue_options.allowed_batch_sizes = allowed_batch_sizes; } if (low_priority_allowed_batch_sizes.empty()) { batcher_queue_options.low_priority_queue_options @@ -827,6 +845,7 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { auto& last_task = batch->task(batch->num_tasks() - 1); OpKernelContext* last_task_context = last_task.context; + const std::string& model_name = GetModelName(last_task_context); // Regardless of the outcome, we need to propagate the status to the // individual tasks and signal that they are done. 
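The new /tensorflow/serving/batching/costs sampler declared above uses monitoring::Buckets::Exponential(1, 2, 27), i.e. the limits 1, 2, 4, ..., 2^26 microseconds described in its comment, with everything larger landing in the final bucket. The sketch below is plain arithmetic to show which limit a sample falls under; it does not use the monitoring API, and the cost value is made up.

#include <cfloat>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  // Bucket limits as described in the comment above: 1, 2, 4, ..., 2^26
  // microseconds, with DBL_MAX closing the final overflow bucket.
  std::vector<double> limits;
  for (int i = 0; i <= 26; ++i) limits.push_back(std::ldexp(1.0, i));  // 2^i
  limits.push_back(DBL_MAX);

  // A made-up 250ms batch cost, expressed in microseconds.
  const double cost_us = 250000.0;

  // The sample belongs to the first bucket whose upper limit exceeds it.
  int bucket = 0;
  while (cost_us >= limits[bucket]) ++bucket;
  std::cout << "cost " << cost_us << " us is below limit " << limits[bucket]
            << " (bucket index " << bucket << ")\n";  // limit 262144 = 2^18
  return 0;
}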
We use MakeCleanup() to @@ -838,8 +857,8 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { if (cleanup_done) { return; } - SplitBatchCostsAndRecordMetrics(batch_cost_measurements, processed_size, - *batch); + SplitBatchCostsAndRecordMetrics(model_name, batch_cost_measurements, + processed_size, *batch); // Clear the measurements before unblocking the batch task, as measurements // are associated with the task's thread context. batch_cost_measurements.clear(); @@ -878,7 +897,6 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr batch) const { args.insert(args.end(), captured_inputs.begin(), captured_inputs.end()); uint64 current_time = EnvTime::NowNanos(); - const string& model_name = GetModelName(last_task_context); for (int i = 0; i < batch->num_tasks(); ++i) { RecordBatchDelayUs((current_time - batch->task(i).start_time) * 1e-3, model_name, last_task_context->op_kernel().name(), @@ -930,15 +948,17 @@ void BatchResourceBase::ProcessBatch(std::unique_ptr batch) const { CreateCostMeasurements(batching_context); int64_t processed_size = batch->size(); - auto batch_cost_split_cleanup = gtl::MakeCleanup([&] { - SplitBatchCostsAndRecordMetrics(batch_cost_measurements, processed_size, - *batch); - }); OpKernelContext* last_task_context = batch->task(batch->num_tasks() - 1).context; AsyncOpKernel::DoneCallback last_task_callback = batch->task(batch->num_tasks() - 1).done_callback; + const std::string& model_name = GetModelName(last_task_context); + + auto batch_cost_cleanup = gtl::MakeCleanup([&] { + SplitBatchCostsAndRecordMetrics(model_name, batch_cost_measurements, + processed_size, *batch); + }); OP_REQUIRES_OK_ASYNC(last_task_context, ValidateBatch(*batch), last_task_callback); @@ -1056,6 +1076,7 @@ Status BatchResourceBase::LookupOrCreateBatcherQueue(const string& queue_name, } void BatchResourceBase::SplitBatchCostsAndRecordMetrics( + const std::string& model_name, const std::vector>& batch_cost_measurements, const int64_t processed_size, BatchT& batch) { @@ -1078,6 +1099,15 @@ void BatchResourceBase::SplitBatchCostsAndRecordMetrics( const absl::string_view cost_type = batch_cost_measurement->GetCostType(); const absl::Duration total_cost = batch_cost_measurement->GetTotalCost(); + // Smeared batch cost: cost for processing this batch. + RecordBatchCosts(model_name, processed_size, + absl::StrCat(cost_type, kWithSmearSuffix), total_cost); + // Non-smeared batch cost: cost for processing inputs in this batch, i.e. + // cost for processing paddings is excluded. + RecordBatchCosts(model_name, processed_size, + absl::StrCat(cost_type, kNoSmearSuffix), + total_cost / processed_size * batch.size()); + for (int i = 0; i < batch.num_tasks(); i++) { RequestCost* request_cost = batch.task(i).request_cost; // Skip recording the cost if the request_cost is null. diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.h b/tensorflow/core/kernels/batching_util/batch_resource_base.h index 5124e9f031733a..b86d25c097da39 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base.h +++ b/tensorflow/core/kernels/batching_util/batch_resource_base.h @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -238,6 +239,7 @@ class BatchResourceBase : public ResourceBase { // 2) the input size from this task; // 3) the padding amount. 
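SplitBatchCostsAndRecordMetrics now reports each batch cost twice: once with smear (the full measured cost, padding included) and once without smear, scaled by total_cost / processed_size * batch.size() so that only the share attributable to real inputs is counted. A small worked example with made-up numbers, using the same absl::Duration arithmetic as the code above:

#include <cstdint>
#include <iostream>
#include "absl/time/time.h"

int main() {
  // Hypothetical numbers: a padded batch of 16 was executed, but only 10
  // elements came from real requests; the measurement reported 80ms.
  const absl::Duration total_cost = absl::Milliseconds(80);
  const int64_t processed_size = 16;  // padded (executed) batch size
  const int64_t real_size = 10;       // batch.size(): real inputs only

  const absl::Duration with_smear = total_cost;
  const absl::Duration no_smear = total_cost / processed_size * real_size;

  std::cout << "with smear: " << absl::ToDoubleMicroseconds(with_smear)
            << " us\n";  // 80000 us
  std::cout << "no smear:   " << absl::ToDoubleMicroseconds(no_smear)
            << " us\n";  // 50000 us
  return 0;
}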
static void SplitBatchCostsAndRecordMetrics( + const std::string& model_name, const std::vector>& batch_cost_measurements, int64_t processed_size, BatchT& batch); diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc b/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc index dc75fde050cc6f..cd4ae4644ed62e 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc +++ b/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc @@ -70,9 +70,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SkipOnNoCostMeasurement) { batch.Close(); std::vector> batch_cost_measurements; - BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/16, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/16, batch); EXPECT_TRUE(batch.task(0).request_cost->GetCosts().empty()); EXPECT_THAT(batch.task(0).request_cost->GetBatchMetrics(), ::testing::ElementsAre(::testing::FieldsAre( @@ -90,9 +89,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SkipOnZeroCost) { std::vector> batch_cost_measurements; batch_cost_measurements.push_back( CostMeasurementRegistry::CreateByNameOrNull("no_op", context)); - BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/16, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/16, batch); EXPECT_TRUE(batch.task(0).request_cost->GetCosts().empty()); EXPECT_THAT(batch.task(0).request_cost->GetBatchMetrics(), ::testing::ElementsAre(::testing::FieldsAre( @@ -108,9 +106,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SkipOnZeroBatchSize) { std::vector> batch_cost_measurements; batch_cost_measurements.push_back( CostMeasurementRegistry::CreateByNameOrNull("test_tpu", context)); - BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/0, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/0, batch); } TEST(SplitBatchCostsAndRecordMetricsTest, SkipOnNoRequestCost) { @@ -123,9 +120,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SkipOnNoRequestCost) { std::vector> batch_cost_measurements; batch_cost_measurements.push_back( CostMeasurementRegistry::CreateByNameOrNull("test_tpu", context)); - BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/16, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/16, batch); EXPECT_EQ(batch.task(0).request_cost, nullptr); EXPECT_EQ(batch.task(1).request_cost, nullptr); @@ -142,9 +138,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SplitSingleCostType) { std::vector> batch_cost_measurements; batch_cost_measurements.push_back( CostMeasurementRegistry::CreateByNameOrNull("test_tpu", context)); - BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/20, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/20, batch); EXPECT_THAT( batch.task(0).request_cost->GetCosts(), @@ -179,9 +174,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SplitMultiCostTypes) { CostMeasurementRegistry::CreateByNameOrNull("test_tpu", context)); batch_cost_measurements.push_back( CostMeasurementRegistry::CreateByNameOrNull("test_gcu", context)); - 
BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/20, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/20, batch); EXPECT_THAT( batch.task(0).request_cost->GetCosts(), @@ -223,9 +217,8 @@ TEST(SplitBatchCostsAndRecordMetricsTest, SplitOnlyNonZeroCostTypes) { CostMeasurementRegistry::CreateByNameOrNull("no_op", context)); batch_cost_measurements.push_back( CostMeasurementRegistry::CreateByNameOrNull("test_tpu", context)); - BatchResourceBase::SplitBatchCostsAndRecordMetrics(batch_cost_measurements, - /*processed_size=*/20, - batch); + BatchResourceBase::SplitBatchCostsAndRecordMetrics( + "model_name", batch_cost_measurements, /*processed_size=*/20, batch); EXPECT_THAT( batch.task(0).request_cost->GetCosts(), diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h index 2b95b91b8f103e..4e218e83dd51c2 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -217,6 +217,9 @@ class SharedBatchScheduler // done by adding padding in the process-batch callback. size_t max_execution_batch_size = 1000; + // If non-empty, contains configured batch sizes. + std::vector allowed_batch_sizes; + // If true, the padding will not be appended. bool disable_padding = false; diff --git a/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc b/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc index 454e7a77b32037..c21a0cc907bce7 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops_3d.cc @@ -671,7 +671,6 @@ DECLARE_GPU_SPEC(double); #undef DECLARE_GPU_SPEC } // namespace functor - // A dummy type to group backward filter autotune results together. struct Conv3dBackwardFilterAutotuneGroup { static string name() { return "Conv3dBwdFilter"; } @@ -702,8 +701,7 @@ void LaunchConvBackpropFilterOpImpl( OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); if (DataTypeToEnum::value == DT_BFLOAT16 && - !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE)) { + !IsBF16SupportedInOps(stream)) { context->SetStatus(errors::Unimplemented( "Conv3DBackpropFilter for GPU with bfloat16 is only supported " "with cuDNN on Ampere GPUs or later.")); @@ -803,86 +801,86 @@ void LaunchConvBackpropFilterOpImpl( << padding_planes << ")"; #if GOOGLE_CUDA - const bool compute_in_nhwc = ComputeInNhwcEnabled( - DataTypeToEnum::value, stream, /*use_4d_tensor=*/false); + const bool compute_in_nhwc = ComputeInNhwcEnabled( + DataTypeToEnum::value, stream, /*use_4d_tensor=*/false); #else - // fast NDHWC implementation is a CUDA only feature - const bool compute_in_nhwc = false; + // fast NDHWC implementation is a CUDA only feature + const bool compute_in_nhwc = false; #endif - const TensorFormat compute_data_format = - (compute_in_nhwc && data_format == FORMAT_NHWC) ? 
FORMAT_NHWC - : FORMAT_NCHW; - - VLOG(3) << "Compute Conv3DBackpropFilter with cuDNN:" - << " data_format=" << ToString(data_format) - << " compute_data_format=" << ToString(compute_data_format); - - constexpr auto kComputeInNHWC = - std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, - se::dnn::FilterLayout::kOutputYXInput); - constexpr auto kComputeInNCHW = - std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, - se::dnn::FilterLayout::kOutputInputYX); - - se::dnn::DataLayout compute_data_layout; - se::dnn::FilterLayout filter_layout; - - std::tie(compute_data_layout, filter_layout) = - compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW; - - se::dnn::BatchDescriptor input_desc(3); - input_desc.set_count(dims.batch_size) - .set_spatial_dim(DimIndex::X, - GetTensorDim(compatible_input, data_format, '2')) - .set_spatial_dim(DimIndex::Y, - GetTensorDim(compatible_input, data_format, '1')) - .set_spatial_dim(DimIndex::Z, - GetTensorDim(compatible_input, data_format, '0')) - .set_feature_map_count(dims.in_depth) - .set_layout(compute_data_layout); - se::dnn::BatchDescriptor output_desc(3); - output_desc.set_count(dims.batch_size) - .set_spatial_dim(DimIndex::X, dims.output_size(2)) - .set_spatial_dim(DimIndex::Y, dims.output_size(1)) - .set_spatial_dim(DimIndex::Z, dims.output_size(0)) - .set_feature_map_count(dims.out_depth) - .set_layout(compute_data_layout); - se::dnn::FilterDescriptor filter_desc(3); - filter_desc.set_spatial_dim(DimIndex::X, dims.filter_size(2)) - .set_spatial_dim(DimIndex::Y, dims.filter_size(1)) - .set_spatial_dim(DimIndex::Z, dims.filter_size(0)) - .set_input_feature_map_count(filter_shape.dim_size(3)) - .set_output_feature_map_count(filter_shape.dim_size(4)) - .set_layout(filter_layout); - se::dnn::ConvolutionDescriptor conv_desc(3); - conv_desc.set_dilation_rate(DimIndex::X, dims.dilation(2)) - .set_dilation_rate(DimIndex::Y, dims.dilation(1)) - .set_dilation_rate(DimIndex::Z, dims.dilation(0)) - .set_filter_stride(DimIndex::X, dims.stride(2)) - .set_filter_stride(DimIndex::Y, dims.stride(1)) - .set_filter_stride(DimIndex::Z, dims.stride(0)) - .set_zero_padding(DimIndex::X, padding_cols / 2) - .set_zero_padding(DimIndex::Y, padding_rows / 2) - .set_zero_padding(DimIndex::Z, padding_planes / 2) - .set_group_count(dims.in_depth / filter_shape.dim_size(3)); - - Tensor pre_transformed_filter_backprop; - auto dst_format = - compute_data_format == FORMAT_NCHW ? FORMAT_OIHW : FORMAT_OHWI; - TensorShape dst_shape = - dst_format == FORMAT_OIHW - ? TensorShape({filter_shape.dim_size(4), filter_shape.dim_size(3), - dims.filter_size(0), dims.filter_size(1), - dims.filter_size(2)}) - : TensorShape({filter_shape.dim_size(4), dims.filter_size(0), - dims.filter_size(1), dims.filter_size(2), - filter_shape.dim_size(3)}); - OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum::value, dst_shape, - &pre_transformed_filter_backprop)); - - Tensor transformed_out_backprop; - if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + const TensorFormat compute_data_format = + (compute_in_nhwc && data_format == FORMAT_NHWC) ? 
FORMAT_NHWC + : FORMAT_NCHW; + + VLOG(3) << "Compute Conv3DBackpropFilter with cuDNN:" + << " data_format=" << ToString(data_format) + << " compute_data_format=" << ToString(compute_data_format); + + constexpr auto kComputeInNHWC = + std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, + se::dnn::FilterLayout::kOutputYXInput); + constexpr auto kComputeInNCHW = + std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, + se::dnn::FilterLayout::kOutputInputYX); + + se::dnn::DataLayout compute_data_layout; + se::dnn::FilterLayout filter_layout; + + std::tie(compute_data_layout, filter_layout) = + compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW; + + se::dnn::BatchDescriptor input_desc(3); + input_desc.set_count(dims.batch_size) + .set_spatial_dim(DimIndex::X, + GetTensorDim(compatible_input, data_format, '2')) + .set_spatial_dim(DimIndex::Y, + GetTensorDim(compatible_input, data_format, '1')) + .set_spatial_dim(DimIndex::Z, + GetTensorDim(compatible_input, data_format, '0')) + .set_feature_map_count(dims.in_depth) + .set_layout(compute_data_layout); + se::dnn::BatchDescriptor output_desc(3); + output_desc.set_count(dims.batch_size) + .set_spatial_dim(DimIndex::X, dims.output_size(2)) + .set_spatial_dim(DimIndex::Y, dims.output_size(1)) + .set_spatial_dim(DimIndex::Z, dims.output_size(0)) + .set_feature_map_count(dims.out_depth) + .set_layout(compute_data_layout); + se::dnn::FilterDescriptor filter_desc(3); + filter_desc.set_spatial_dim(DimIndex::X, dims.filter_size(2)) + .set_spatial_dim(DimIndex::Y, dims.filter_size(1)) + .set_spatial_dim(DimIndex::Z, dims.filter_size(0)) + .set_input_feature_map_count(filter_shape.dim_size(3)) + .set_output_feature_map_count(filter_shape.dim_size(4)) + .set_layout(filter_layout); + se::dnn::ConvolutionDescriptor conv_desc(3); + conv_desc.set_dilation_rate(DimIndex::X, dims.dilation(2)) + .set_dilation_rate(DimIndex::Y, dims.dilation(1)) + .set_dilation_rate(DimIndex::Z, dims.dilation(0)) + .set_filter_stride(DimIndex::X, dims.stride(2)) + .set_filter_stride(DimIndex::Y, dims.stride(1)) + .set_filter_stride(DimIndex::Z, dims.stride(0)) + .set_zero_padding(DimIndex::X, padding_cols / 2) + .set_zero_padding(DimIndex::Y, padding_rows / 2) + .set_zero_padding(DimIndex::Z, padding_planes / 2) + .set_group_count(dims.in_depth / filter_shape.dim_size(3)); + + Tensor pre_transformed_filter_backprop; + auto dst_format = + compute_data_format == FORMAT_NCHW ? FORMAT_OIHW : FORMAT_OHWI; + TensorShape dst_shape = + dst_format == FORMAT_OIHW + ? 
TensorShape({filter_shape.dim_size(4), filter_shape.dim_size(3), + dims.filter_size(0), dims.filter_size(1), + dims.filter_size(2)}) + : TensorShape({filter_shape.dim_size(4), dims.filter_size(0), + dims.filter_size(1), dims.filter_size(2), + filter_shape.dim_size(3)}); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, dst_shape, + &pre_transformed_filter_backprop)); + + Tensor transformed_out_backprop; + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { VLOG(4) << "Convert the `out_backprop` tensor from NDHWC to NCDHW."; TensorShape nchw_shape = {dims.batch_size, dims.out_depth, dims.output_size(0), dims.output_size(1), @@ -897,11 +895,11 @@ void LaunchConvBackpropFilterOpImpl( } else { CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); } - } else { + } else { transformed_out_backprop = out_backprop; - } - Tensor transformed_input; - if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { + } + Tensor transformed_input; + if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { VLOG(4) << "Convert the `input` tensor from NDHWC to NCDHW."; TensorShape nchw_shape = { dims.batch_size, dims.in_depth, compatible_input.dim_size(1), @@ -917,96 +915,93 @@ void LaunchConvBackpropFilterOpImpl( } else { CHECK(transformed_input.CopyFrom(compatible_input, nchw_shape)); } - } else { + } else { transformed_input = compatible_input; - } + } - auto out_backprop_ptr = - AsDeviceMemory(transformed_out_backprop.template flat().data(), - transformed_out_backprop.template flat().size()); - auto filter_backprop_ptr = AsDeviceMemory( - pre_transformed_filter_backprop.template flat().data(), - pre_transformed_filter_backprop.template flat().size()); - auto input_ptr = - AsDeviceMemory(transformed_input.template flat().data(), - transformed_input.template flat().size()); - - static int64_t ConvolveBackwardFilterScratchSize = - GetDnnWorkspaceLimitOrDefault(); - - const ConvParameters conv_parameters = { - stream->parent(), - dims.batch_size, - dims.in_depth, - {{dims.input_size(0), dims.input_size(1), dims.input_size(2)}}, - compute_data_format, - dims.out_depth, - {{dims.filter_size(0), dims.filter_size(1), dims.filter_size(2)}}, - {{dims.dilation(0), dims.dilation(1), dims.dilation(2)}}, - {{dims.stride(0), dims.stride(1), dims.stride(2)}}, - {{padding_planes, padding_rows, padding_cols}}, - input.dtype(), - conv_desc.group_count(), - }; - - using se::dnn::AlgorithmConfig; - using se::dnn::AlgorithmDesc; - using se::dnn::ProfileResult; - - auto entry_or = AutotuneUnfusedConv( - cudnn_use_autotune, AutotuneConv3dBwdFilter::GetInstance(), - conv_parameters, context, se::dnn::ConvolutionKind::BACKWARD_FILTER, - input_desc, input_ptr, filter_desc, filter_backprop_ptr, conv_desc, - output_desc, out_backprop_ptr, ConvolveBackwardFilterScratchSize); - OP_REQUIRES_OK(context, entry_or.status()); - auto autotune_entry = std::move(entry_or).value(); - - DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, - context); - Status cudnn_launch_status = LaunchAutotunedConv( - autotune_entry, &scratch_allocator, - se::dnn::ConvolutionKind::BACKWARD_FILTER, stream, input_desc, - input_ptr, filter_desc, filter_backprop_ptr, conv_desc, output_desc, - out_backprop_ptr); - if (!cudnn_launch_status.ok()) { - context->SetStatus(cudnn_launch_status); - return; - } + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat().data(), + transformed_out_backprop.template flat().size()); + auto 
filter_backprop_ptr = + AsDeviceMemory(pre_transformed_filter_backprop.template flat().data(), + pre_transformed_filter_backprop.template flat().size()); + auto input_ptr = AsDeviceMemory(transformed_input.template flat().data(), + transformed_input.template flat().size()); + + static int64_t ConvolveBackwardFilterScratchSize = + GetDnnWorkspaceLimitOrDefault(); + + const ConvParameters conv_parameters = { + stream->parent(), + dims.batch_size, + dims.in_depth, + {{dims.input_size(0), dims.input_size(1), dims.input_size(2)}}, + compute_data_format, + dims.out_depth, + {{dims.filter_size(0), dims.filter_size(1), dims.filter_size(2)}}, + {{dims.dilation(0), dims.dilation(1), dims.dilation(2)}}, + {{dims.stride(0), dims.stride(1), dims.stride(2)}}, + {{padding_planes, padding_rows, padding_cols}}, + input.dtype(), + conv_desc.group_count(), + }; + + using se::dnn::AlgorithmConfig; + using se::dnn::AlgorithmDesc; + using se::dnn::ProfileResult; + + auto entry_or = AutotuneUnfusedConv( + cudnn_use_autotune, AutotuneConv3dBwdFilter::GetInstance(), + conv_parameters, context, se::dnn::ConvolutionKind::BACKWARD_FILTER, + input_desc, input_ptr, filter_desc, filter_backprop_ptr, conv_desc, + output_desc, out_backprop_ptr, ConvolveBackwardFilterScratchSize); + OP_REQUIRES_OK(context, entry_or.status()); + auto autotune_entry = std::move(entry_or).value(); + + DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, + context); + Status cudnn_launch_status = LaunchAutotunedConv( + autotune_entry, &scratch_allocator, + se::dnn::ConvolutionKind::BACKWARD_FILTER, stream, input_desc, input_ptr, + filter_desc, filter_backprop_ptr, conv_desc, output_desc, + out_backprop_ptr); + if (!cudnn_launch_status.ok()) { + context->SetStatus(cudnn_launch_status); + return; + } - auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; - functor::ReverseTransformFilter()( - context->eigen_device(), /*src_filter_format=*/dst_format, - toConstTensor(pre_transformed_filter_backprop).template tensor(), - filter_backprop->tensor()); + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::ReverseTransformFilter()( + context->eigen_device(), /*src_filter_format=*/dst_format, + toConstTensor(pre_transformed_filter_backprop).template tensor(), + filter_backprop->tensor()); } template struct LaunchConvBackpropFilterOp { - static void launch(OpKernelContext* context, bool cudnn_use_autotune, - const Tensor& input, const Tensor& out_backprop, - const std::vector& dilation, - const std::vector& stride, const Padding& padding, - Tensor* filter_backprop, TensorFormat data_format) { - LaunchConvBackpropFilterOpImpl(context, cudnn_use_autotune, input, - out_backprop, dilation, stride, padding, - filter_backprop, data_format); - } + static void launch(OpKernelContext* context, bool cudnn_use_autotune, + const Tensor& input, const Tensor& out_backprop, + const std::vector& dilation, + const std::vector& stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format) { + LaunchConvBackpropFilterOpImpl(context, cudnn_use_autotune, input, + out_backprop, dilation, stride, padding, + filter_backprop, data_format); + } }; template <> struct LaunchConvBackpropFilterOp { - static void launch(OpKernelContext* ctx, bool cudnn_use_autotune, - const Tensor& input, const Tensor& out_backprop, - const std::vector& dilation, - const std::vector& stride, const Padding& padding, - Tensor* filter_backprop, TensorFormat data_format) { - // Performant bfloat16 operations are 
supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. - auto* stream = ctx->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); - - if (cast_to_float) { + static void launch(OpKernelContext* ctx, bool cudnn_use_autotune, + const Tensor& input, const Tensor& out_backprop, + const std::vector& dilation, + const std::vector& stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format) { + auto* stream = ctx->op_device_context()->stream(); + + const bool cast_to_float = !IsBF16SupportedInOps(stream); + + if (cast_to_float) { Tensor casted_input = input; Tensor casted_out_backprop = out_backprop; Tensor casted_filter_backprop = *filter_backprop; @@ -1035,96 +1030,96 @@ struct LaunchConvBackpropFilterOp { cast_back(device, filter_backprop->template flat(), casted_filter_backprop_const.template flat()); return; - } - - LaunchConvBackpropFilterOpImpl( - ctx, cudnn_use_autotune, input, out_backprop, dilation, stride, - padding, filter_backprop, data_format); } + + LaunchConvBackpropFilterOpImpl( + ctx, cudnn_use_autotune, input, out_backprop, dilation, stride, padding, + filter_backprop, data_format); + } }; template class Conv3DBackpropFilterOp : public OpKernel { - public: - explicit Conv3DBackpropFilterOp(OpKernelConstruction* context) - : OpKernel(context), - data_format_(FORMAT_NHWC), - takes_shape_(type_string().find("V2") != std::string::npos) { - // data_format is only available in V2. - if (takes_shape_) { + public: + explicit Conv3DBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context), + data_format_(FORMAT_NHWC), + takes_shape_(type_string().find("V2") != std::string::npos) { + // data_format is only available in V2. 
+ if (takes_shape_) { string data_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); - } - OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); - OP_REQUIRES(context, dilation_.size() == 5, - errors::InvalidArgument("Dilation rates field must " - "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(dilation_, data_format_, 'C') == 1 && - GetTensorDim(dilation_, data_format_, 'N') == 1), - errors::InvalidArgument( - "Current implementation does not yet support " - "dilation rates in the batch and depth dimensions.")); - OP_REQUIRES( - context, - (GetTensorDim(dilation_, data_format_, '0') > 0 && - GetTensorDim(dilation_, data_format_, '1') > 0 && - GetTensorDim(dilation_, data_format_, '2') > 0), - errors::InvalidArgument("Dilated rates should be larger than 0.")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); - OP_REQUIRES(context, stride_.size() == 5, - errors::InvalidArgument("Sliding window strides field must " - "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(stride_, data_format_, 'C') == 1 && - GetTensorDim(stride_, data_format_, 'N') == 1), - errors::InvalidArgument( - "Current implementation does not yet support " - "strides in the batch and depth dimensions.")); - OP_REQUIRES( - context, - (GetTensorDim(stride_, data_format_, '0') > 0 && - GetTensorDim(stride_, data_format_, '1') > 0 && - GetTensorDim(stride_, data_format_, '2') > 0), - errors::InvalidArgument("Spatial strides should be larger than 0.")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); - cudnn_use_autotune_ = CudnnUseAutotune(); } + OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilation_)); + OP_REQUIRES(context, dilation_.size() == 5, + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, + (GetTensorDim(dilation_, data_format_, 'C') == 1 && + GetTensorDim(dilation_, data_format_, 'N') == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "dilation rates in the batch and depth dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(dilation_, data_format_, '0') > 0 && + GetTensorDim(dilation_, data_format_, '1') > 0 && + GetTensorDim(dilation_, data_format_, '2') > 0), + errors::InvalidArgument("Dilated rates should be larger than 0.")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 5, + errors::InvalidArgument("Sliding window strides field must " + "specify 5 dimensions")); + OP_REQUIRES( + context, + (GetTensorDim(stride_, data_format_, 'C') == 1 && + GetTensorDim(stride_, data_format_, 'N') == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES( + context, + (GetTensorDim(stride_, data_format_, '0') > 0 && + GetTensorDim(stride_, data_format_, '1') > 0 && + GetTensorDim(stride_, data_format_, '2') > 0), + errors::InvalidArgument("Spatial strides should be larger than 0.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + cudnn_use_autotune_ = CudnnUseAutotune(); + } - void Compute(OpKernelContext* context) override { - const Tensor& input = context->input(0); - const Tensor& out_backprop = context->input(2); + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& 
out_backprop = context->input(2); - TensorShape filter_shape; - if (takes_shape_) { + TensorShape filter_shape; + if (takes_shape_) { const Tensor& filter_sizes = context->input(1); OP_REQUIRES(context, TensorShapeUtils::IsVector(filter_sizes.shape()), errors::InvalidArgument( "filter_sizes shape must be rank 1 but is rank ", filter_sizes.shape().dims())); OP_REQUIRES_OK(context, tensor::MakeShape(filter_sizes, &filter_shape)); - } else { + } else { filter_shape = context->input(1).shape(); - } + } - Tensor* filter_backprop; - OP_REQUIRES_OK( - context, context->allocate_output(0, filter_shape, &filter_backprop)); + Tensor* filter_backprop; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); - LaunchConvBackpropFilterOp::launch( - context, cudnn_use_autotune_, input, out_backprop, dilation_, stride_, - padding_, filter_backprop, data_format_); - } + LaunchConvBackpropFilterOp::launch( + context, cudnn_use_autotune_, input, out_backprop, dilation_, stride_, + padding_, filter_backprop, data_format_); + } - private: - std::vector dilation_; - std::vector stride_; - Padding padding_; - TensorFormat data_format_; - bool takes_shape_; - bool cudnn_use_autotune_; + private: + std::vector dilation_; + std::vector stride_; + Padding padding_; + TensorFormat data_format_; + bool takes_shape_; + bool cudnn_use_autotune_; }; #define REGISTER_GPU_KERNEL(T) \ diff --git a/tensorflow/core/kernels/conv_grad_filter_ops_launcher.cc b/tensorflow/core/kernels/conv_grad_filter_ops_launcher.cc index 1c1472ead96faa..e65e5995e92045 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops_launcher.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops_launcher.cc @@ -539,11 +539,8 @@ operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, const Padding& padding, const std::vector& explicit_paddings, Tensor* filter_backprop, TensorFormat data_format) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = ctx->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_input = input; diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index 327855646e1f60..cf805027cb5835 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -463,11 +463,8 @@ void LaunchConv2DBackpropInputOp::operator()( int col_dilation, int row_stride, int col_stride, const Padding& padding, const std::vector& explicit_paddings, Tensor* in_backprop, TensorFormat data_format) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. 
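Across the conv and pooling launchers in this change, the inline Ampere compute-capability check is replaced by the shared IsBF16SupportedInOps(stream) helper; the fallback itself is unchanged: when native bfloat16 is unavailable, inputs are cast up to float, the float kernel runs, and the result is cast back. The sketch below shows that cast-up/compute/cast-down shape in plain C++; the BF16 struct, ComputeF32, and ComputeBF16 are illustrative stand-ins, not the TF kernels.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Toy bfloat16 stand-in: keep only the upper 16 bits of a float's encoding.
struct BF16 {
  uint16_t bits;
};

BF16 FloatToBF16(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return BF16{static_cast<uint16_t>(u >> 16)};
}

float BF16ToFloat(BF16 b) {
  uint32_t u = static_cast<uint32_t>(b.bits) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

// The float path that is always available; here it just doubles each element.
std::vector<float> ComputeF32(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) out[i] = 2.0f * in[i];
  return out;
}

// Fallback shape used by the kernels above: when the device lacks a native
// bf16 path, cast inputs up to float, run the float kernel, cast back down.
std::vector<BF16> ComputeBF16(bool native_bf16, const std::vector<BF16>& in) {
  std::vector<BF16> out(in.size());
  if (!native_bf16) {
    std::vector<float> in_f(in.size());
    for (size_t i = 0; i < in.size(); ++i) in_f[i] = BF16ToFloat(in[i]);
    std::vector<float> out_f = ComputeF32(in_f);
    for (size_t i = 0; i < out.size(); ++i) out[i] = FloatToBF16(out_f[i]);
    return out;
  }
  // A native bf16 kernel would run here instead.
  for (size_t i = 0; i < in.size(); ++i)
    out[i] = FloatToBF16(2.0f * BF16ToFloat(in[i]));
  return out;
}

int main() {
  std::vector<BF16> x = {FloatToBF16(1.5f), FloatToBF16(-3.0f)};
  std::vector<BF16> y = ComputeBF16(/*native_bf16=*/false, x);
  std::cout << BF16ToFloat(y[0]) << " " << BF16ToFloat(y[1]) << "\n";  // 3 -6
  return 0;
}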
auto* stream = ctx->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_out_backprop = out_backprop; diff --git a/tensorflow/core/kernels/conv_grad_input_ops_3d.cc b/tensorflow/core/kernels/conv_grad_input_ops_3d.cc index 06cf67d0fc4b50..70311cbbd7a3d7 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops_3d.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops_3d.cc @@ -657,7 +657,6 @@ TF_CALL_double(REGISTER_CPU_KERNEL); TF_CALL_bfloat16(REGISTER_CPU_KERNEL); #undef REGISTER_CPU_KERNEL - // GPU definitions of both ops. #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Forward declarations of the functor specializations for GPU. @@ -1025,11 +1024,8 @@ struct LaunchConvBackpropInputOp { const std::vector& dilation, const std::vector& strides, const Padding& padding, Tensor* in_backprop, TensorFormat data_format) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = ctx->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_out_backprop = out_backprop; @@ -1153,15 +1149,14 @@ class Conv3DBackpropInputOp : public OpKernel { bool cudnn_use_autotune_; }; - -#define REGISTER_GPU_KERNEL(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint("T"), \ - Conv3DBackpropInputOp); \ - REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("input_sizes"), \ +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint("T"), \ + Conv3DBackpropInputOp); \ + REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("input_sizes"), \ Conv3DBackpropInputOp); TF_CALL_half(REGISTER_GPU_KERNEL); TF_CALL_bfloat16(REGISTER_GPU_KERNEL); diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc index d932b57189b4ef..72bad756b4d0fd 100644 --- a/tensorflow/core/kernels/conv_ops_3d.cc +++ b/tensorflow/core/kernels/conv_ops_3d.cc @@ -227,11 +227,9 @@ struct LaunchConv3DOp { strides.end()); gtl::InlinedVector casted_dilations(dilations.begin(), dilations.end()); - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. + auto* stream = ctx->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_input = input_param; diff --git a/tensorflow/core/kernels/conv_ops_bfloat16.cc b/tensorflow/core/kernels/conv_ops_bfloat16.cc index 918c17c0f31b02..37507841647f0b 100644 --- a/tensorflow/core/kernels/conv_ops_bfloat16.cc +++ b/tensorflow/core/kernels/conv_ops_bfloat16.cc @@ -118,11 +118,8 @@ void LaunchConvOp::operator()( dilations_spatial[i] = GetTensorDim(dilations, data_format, static_cast(i + '0')); } - // Performant bfloat16 operations are supported for Ampere+ GPUs. 
For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = context->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_input = input; @@ -173,11 +170,8 @@ void LaunchConv2DOp::operator()( gtl::InlinedVector casted_dilations = {row_dilation, col_dilation}; - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = ctx->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_input = input_param; diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc index dce0e995be7581..bd6e9ed054762a 100644 --- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc +++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc @@ -149,11 +149,9 @@ void DnnPooling3dOp::Compute( const std::array& window, const std::array& stride, const std::array& padding, TensorFormat data_format, const Tensor& tensor_in, Tensor* output) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = context->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); + if (cast_to_float) { Tensor casted_in; Tensor casted_output; @@ -348,11 +346,8 @@ void DnnPooling3dGradOp::Compute( const std::array& output_size, TensorFormat data_format, const Tensor& out_backprop, const TensorShape& tensor_in_shape, const Tensor* tensor_in, const Tensor* tensor_out, Tensor* input_backprop) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = context->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_out_backprop; Tensor casted_tensor_in; diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 06f1bf2bb3c531..16812fc1a59342 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -1,11 +1,11 @@ # Description: # OpKernels for tf.data -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") # Definitions are loaded separately so that copybara can pattern match (and modify) each definition. 
load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_kernel_library") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -996,6 +996,7 @@ tf_kernel_library( "//tensorflow/core/data:name_utils", "//tensorflow/core/data:split_utils", "@com_google_absl//absl/memory", + "@local_tsl//tsl/platform:types", ], ) @@ -1064,6 +1065,8 @@ tf_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/data:name_utils", + "@com_google_absl//absl/status", + "@local_tsl//tsl/platform:errors", ], ) @@ -1172,7 +1175,9 @@ tf_kernel_library( "//tensorflow/core/data:dataset_utils", "//tensorflow/core/data:name_utils", "//tensorflow/core/data:serialization_utils", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/random", + "@com_google_absl//absl/strings", ], ) @@ -1512,6 +1517,8 @@ filegroup( "//tensorflow/core/data:captured_function.h", "//tensorflow/core/data:compression_utils.h", "//tensorflow/core/data:dataset_utils.h", + "//tensorflow/core/data:file_logger_client_interface.h", + "//tensorflow/core/data:file_logger_client_no_op.h", "//tensorflow/core/data:finalization_utils.h", "//tensorflow/core/data:metric_utils.h", "//tensorflow/core/data:name_utils.h", diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index 768fd7c6cf0e0d..3ffe6bfc78eb76 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -919,6 +919,8 @@ tf_kernel_library( ":data_service_dataset_op", ":data_service_ops", ":distributed_save_op", + "//tensorflow/core/data/service/snapshot:list_snapshot_chunks_dataset_op", + "//tensorflow/core/data/service/snapshot:snapshot_chunk_dataset_op", ], ) @@ -964,7 +966,6 @@ tf_kernel_library( ":to_tf_record_op", ":unbatch_dataset_op", ":unique_dataset_op", - "//tensorflow/core/data/service/snapshot:snapshot_chunk_dataset_op", ] + select({ "//tensorflow:fuchsia": [], "//conditions:default": [":lmdb_dataset_op"], diff --git a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc index 2a955123681ca6..05035da404cc99 100644 --- a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc @@ -495,9 +495,16 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { // time of the node. If restoring, pass nullptr to not record processing // time because iterator modeling is only used to model Iterator's // GetNext() resource usage. - TF_RETURN_IF_ERROR(instantiated_reduce_func_->Run( + auto status = instantiated_reduce_func_->Run( ctx, std::move(args), &return_values, - ctx->is_restoring() ? nullptr : model_node())); + ctx->is_restoring() ? 
nullptr : model_node()); + if (!status.ok()) { + return absl::InternalError(absl::StrFormat( + "Got error code %s and message: {\n%s\n} \nfrom running " + "user-defined function %s: ", + absl::StatusCodeToString(status.code()), status.message(), + instantiated_reduce_func_->func_name())); + } if (!(return_values.size() == 1 && return_values[0].dtype() == DT_VARIANT && diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc index b6379c598fbd31..9abee3f1112296 100644 --- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc @@ -139,6 +139,7 @@ class FlatMapDatasetOp::Dataset : public DatasetBase { Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) override { + // LINT.IfChange(GetNextInternal) mutex_lock l(mu_); do { if (!input_impl_) { @@ -149,28 +150,39 @@ class FlatMapDatasetOp::Dataset : public DatasetBase { // We are currently processing a mapped element, so try to get the // next subelement. bool end_of_element; + // Create a new context so that we have a separate `checkpoint` + // different from `ctx->checkpoint()` auto nested_ctx = MakeNestedIteratorContext(ctx); TF_RETURN_IF_ERROR(current_element_iterator_->GetNext( &nested_ctx, out_tensors, &end_of_element)); + + // Merge the checkpoint so that the changes made to + // `current_element_iterator_` is propagated ctx->MergeCheckpoint(nested_ctx.checkpoint()); if (!end_of_element) { // Produce the subelement as output. *end_of_sequence = false; return OkStatus(); } + // Since this sub-iterator is done, + // we can commit `input_ckpt_` to `ctx->checkpoint()` ctx->MergeCheckpoint(input_ckpt_.get()); + // Also clean up this sub-iterator's checkpoint inside of + // `ctx->checkpoint()` since it has been consumed. + ctx->PurgeCheckpoint(current_element_iterator_->prefix()); // We have reached the end of the current element, so maybe move on // to the next element. - ctx->PurgeCheckpoint(current_element_iterator_->prefix()); current_element_iterator_.reset(); } - // Get the next element from the input dataset. inputs_.clear(); auto input_ctx = std::make_unique(*ctx); TF_RETURN_IF_ERROR( input_impl_->GetNext(input_ctx.get(), &inputs_, end_of_sequence)); + // Merge the checkpoint to `input_ckpt_` but do not commit to + // `ctx->checkpoint()` yet until the sub-iterator created from + // this `inputs_` is consumed. input_ckpt_->Merge(input_ctx->checkpoint()); if (*end_of_sequence) { input_impl_.reset(); @@ -180,10 +192,12 @@ class FlatMapDatasetOp::Dataset : public DatasetBase { TF_RETURN_IF_ERROR( BuildCurrentElementIteratorLocked(ctx, /*is_get_next=*/true)); } while (true); + // LINT.ThenChange(:SkipInternal) } Status SkipInternal(IteratorContext* ctx, int num_to_skip, bool* end_of_sequence, int* num_skipped) override { + // LINT.IfChange(SkipInternal) mutex_lock l(mu_); *num_skipped = 0; while (*num_skipped < num_to_skip) { @@ -191,33 +205,65 @@ class FlatMapDatasetOp::Dataset : public DatasetBase { *end_of_sequence = true; return OkStatus(); } - if (!current_element_iterator_) { - // Get the next element from the input dataset. - inputs_.clear(); - TF_RETURN_IF_ERROR( - input_impl_->GetNext(ctx, &inputs_, end_of_sequence)); - if (*end_of_sequence) { - input_impl_.reset(); - *end_of_sequence = true; - return OkStatus(); + if (current_element_iterator_) { + // We are currently processing a mapped element, so try to get the + // next subelement. 
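The group_by_window change above stops returning the user-defined reduce function's status verbatim and instead wraps it in an InternalError that spells out the original code, message, and function name. Below is a standalone sketch of the same wrapping using only absl; RunUserFn and RunAndWrap are made-up names.

#include <iostream>
#include <string>
#include "absl/status/status.h"
#include "absl/strings/str_format.h"

// Stand-in for running a user-defined function that may fail.
absl::Status RunUserFn() {
  return absl::InvalidArgumentError("window reducer produced no dataset");
}

absl::Status RunAndWrap(const std::string& func_name) {
  absl::Status status = RunUserFn();
  if (!status.ok()) {
    // Preserve the original code and message, but say which function failed.
    return absl::InternalError(absl::StrFormat(
        "Got error code %s and message: {\n%s\n} from running "
        "user-defined function %s",
        absl::StatusCodeToString(status.code()), status.message(), func_name));
  }
  return status;
}

int main() {
  std::cout << RunAndWrap("my_reduce_fn").ToString() << std::endl;
  return 0;
}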
+ + bool end_of_element; + // Create a new context so that we have a separate `checkpoint` + // different from `ctx->checkpoint()` + auto nested_ctx = MakeNestedIteratorContext(ctx); + + // `last_num_skipped` stores how many elements + // we have actually skipped. + int last_num_skipped; + TF_RETURN_IF_ERROR(current_element_iterator_->Skip( + &nested_ctx, num_to_skip - *num_skipped, &end_of_element, + &last_num_skipped)); + *num_skipped += last_num_skipped; + + // Merge the checkpoint so that the changes made to + // `current_element_iterator_` is propagated + ctx->MergeCheckpoint(nested_ctx.checkpoint()); + if (!end_of_element) { + if (*num_skipped != num_to_skip) { + return absl::InternalError(absl::StrFormat( + "Expected `num_skipped` and `num_to_skip` to be the same. Got" + " %d(num_skipped) and %d(num_to_skip)", + *num_skipped, num_to_skip)); + } + continue; } - TF_RETURN_IF_ERROR( - BuildCurrentElementIteratorLocked(ctx, /*is_get_next=*/false)); - } - bool end_of_element; - int last_num_skipped; - TF_RETURN_IF_ERROR(current_element_iterator_->Skip( - MakeNestedIteratorContext(ctx), num_to_skip - *num_skipped, - &end_of_element, &last_num_skipped)); - *num_skipped += last_num_skipped; - if (end_of_element) { + // Since this sub-iterator is done, + // we can commit `input_ckpt_` to `ctx->checkpoint()` + ctx->MergeCheckpoint(input_ckpt_.get()); + // Also clean up this sub-iterator's checkpoint inside of + // `ctx->checkpoint()` since it has been consumed. + ctx->PurgeCheckpoint(current_element_iterator_->prefix()); // We have reached the end of the current element, so maybe move on // to the next element. current_element_iterator_.reset(); } + // Get the next element from the input dataset. + inputs_.clear(); + auto input_ctx = std::make_unique(*ctx); + TF_RETURN_IF_ERROR( + input_impl_->GetNext(input_ctx.get(), &inputs_, end_of_sequence)); + // Merge the checkpoint to `input_ckpt_` but do not commit to + // `ctx->checkpoint()` yet until the sub-iterator created from + // this `inputs_` is consumed. + input_ckpt_->Merge(input_ctx->checkpoint()); + if (*end_of_sequence) { + input_impl_.reset(); + *end_of_sequence = true; + return OkStatus(); + } + TF_RETURN_IF_ERROR( + BuildCurrentElementIteratorLocked(ctx, /*is_get_next=*/false)); } *end_of_sequence = false; return OkStatus(); + // LINT.ThenChange(:GetNextInternal) } protected: diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc index 2ae28298d81f27..41e98622f18e5f 100644 --- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc @@ -226,7 +226,13 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase { {"cycle_length", strings::Printf("%lld", static_cast(cycle_length))}, {"deterministic", - deterministic.IsNondeterministic() ? "false" : "true"}}) { + deterministic.IsNondeterministic() ? "false" : "true"}, + {"buffer_output_elements", + strings::Printf("%lld", + static_cast(buffer_output_elements_))}, + {"prefetch_input_elements", + strings::Printf( + "%lld", static_cast(prefetch_input_elements_))}}) { input_->Ref(); } diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc index 35313d78f26d09..b6aa84aa7089f5 100644 --- a/tensorflow/core/kernels/data/range_dataset_op.cc +++ b/tensorflow/core/kernels/data/range_dataset_op.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/errors.h" +#include "tsl/platform/types.h" namespace tensorflow { namespace data { @@ -65,6 +66,25 @@ Status ConvertOutputTypes(const tensorflow::DataTypeVector& output_dtypes, int64_t sgn(int64_t val) { return (0 < val) - (val < 0); } +int64_t RangeCardinality(int64_t start, int64_t stop, int64_t step) { + // `enumerate` uses int max to simulate an infinite range dataset. + if (stop >= tsl::kint64max) { + return kInfiniteCardinality; + } + + // If the signs of `stop - start` and `step` are different or either of + // the values is zero, the range will be empty. + if (sgn(stop - start) * sgn(step) <= 0) { + return 0; + } else if (step > 0) { + // Invariant: stop - start > 0 && step > 0 + return (stop - start - 1) / step + 1; + } else { + // Invariant: start - stop > 0 && step < 0 + return (start - stop - 1) / -step + 1; + } +} + // Class which produces the elements of `range(start, stop, step)`. Threadsafe. class RangeCounter { public: @@ -100,6 +120,8 @@ class RangeCounter { next_ = value; } + int64_t Cardinality() const { return RangeCardinality(start_, stop_, step_); } + private: const int64_t start_; const int64_t stop_; @@ -147,6 +169,8 @@ class RangeDatasetOp::RangeSplitProvider : public SplitProvider { return OkStatus(); } + int64_t Cardinality() const override { return counter_.Cardinality(); } + private: RangeCounter counter_; }; @@ -185,17 +209,7 @@ class RangeDatasetOp::Dataset : public DatasetBase { } int64_t CardinalityInternal(CardinalityOptions options) const override { - // If the signs of `stop_ - start_` and `step_` are different or either of - // the values is zero, the range will be empty. - if (sgn(stop_ - start_) * sgn(step_) <= 0) { - return 0; - } else if (step_ > 0) { - // Invariant: stop_ - start_ > 0 && step_ > 0 - return (stop_ - start_ - 1) / step_ + 1; - } else { - // Invariant: start_ - stop_ > 0 && step_ < 0 - return (start_ - stop_ - 1) / -step_ + 1; - } + return RangeCardinality(start_, stop_, step_); } Status MakeSplitProviders(std::vector>* diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc index 819f3bf087a66a..2e66c311d43b89 100644 --- a/tensorflow/core/kernels/data/repeat_dataset_op.cc +++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc @@ -14,14 +14,18 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/data/repeat_dataset_op.h" +#include +#include #include #include #include +#include "absl/status/status.h" #include "tensorflow/core/data/name_utils.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" +#include "tsl/platform/errors.h" namespace tensorflow { namespace data { @@ -74,6 +78,49 @@ bool HasDataServiceInput(const DatasetBase* dataset) { } } // namespace +// Updates an input split provider with the appropriate cardinality count based +// on how many times it is repeated. +class RepeatedSplitProvider : public SplitProvider { + public: + explicit RepeatedSplitProvider(std::unique_ptr split_provider, + int64_t count) + : split_provider_(std::move(split_provider)), count_(count) {} + + // Updates the cardinality based on the times the input dataset is repeated. 
+ int64_t Cardinality() const override { + if (split_provider_->Cardinality() == 0 || count_ == 0) { + return 0; + } + // From tensorflow/python/data/ops/repeat_op.py, the repeat op uses -1 for + // infinite repetitions. + if (count_ < 0) { + return kInfiniteCardinality; + } + if (split_provider_->Cardinality() < 0) { + return split_provider_->Cardinality(); + } + return split_provider_->Cardinality() * count_; + } + + // The following are the same as the input split provider. + absl::Status GetNext(Tensor* split, bool* end_of_splits) override { + return split_provider_->GetNext(split, end_of_splits); + } + absl::Status Reset() override { return split_provider_->Reset(); } + absl::Status Save(std::function full_name, + IteratorStateWriter* writer) override { + return split_provider_->Save(full_name, writer); + } + absl::Status Restore(std::function full_name, + IteratorStateReader* reader) override { + return split_provider_->Restore(full_name, reader); + } + + private: + const std::unique_ptr split_provider_; + const int64_t count_; +}; + class RepeatDatasetOp::Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, int64_t count, const DatasetBase* input) @@ -97,6 +144,19 @@ class RepeatDatasetOp::Dataset : public DatasetBase { } } + absl::Status MakeSplitProviders(std::vector>* + split_providers) const override { + std::vector> input_split_providers; + TF_RETURN_IF_ERROR(input_->MakeSplitProviders(&input_split_providers)); + + split_providers->clear(); + for (auto& split_provider : input_split_providers) { + split_providers->push_back(std::make_unique( + std::move(split_provider), count_)); + } + return absl::OkStatus(); + } + const DataTypeVector& output_dtypes() const override { return input_->output_dtypes(); } diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc index 5143182f0b1a90..cb2c28dbf1ea58 100644 --- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc +++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc @@ -23,6 +23,8 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/data/dataset_utils.h" #include "tensorflow/core/data/name_utils.h" #include "tensorflow/core/data/serialization_utils.h" @@ -203,6 +205,12 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase { mutex_lock l(mu_); seed_generator_->GenerateSeeds(&seed_, &seed2_); ResetRngs(); + // Initialize checkpoint_indices_ to the entire buffer. 
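The RepeatedSplitProvider::Cardinality combination above can be read as a small decision table: an empty input or a count of 0 gives 0; a negative count (the -1 "repeat forever" encoding from repeat_op.py) gives infinite; an unknown or infinite input cardinality passes through unchanged; otherwise the two multiply. A hedged standalone restatement with local constants instead of TensorFlow's cardinality sentinels:

#include <cassert>
#include <cstdint>

constexpr int64_t kInfinite = -2;  // stand-ins for TF's cardinality constants
constexpr int64_t kUnknown = -1;

int64_t RepeatedCardinality(int64_t input_cardinality, int64_t count) {
  if (input_cardinality == 0 || count == 0) return 0;
  if (count < 0) return kInfinite;                       // repeat(-1): forever
  if (input_cardinality < 0) return input_cardinality;   // unknown or infinite
  return input_cardinality * count;
}

int main() {
  assert(RepeatedCardinality(10, 3) == 30);
  assert(RepeatedCardinality(10, -1) == kInfinite);
  assert(RepeatedCardinality(kUnknown, 5) == kUnknown);
  assert(RepeatedCardinality(0, -1) == 0);  // empty input stays empty
}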
+ if (ctx->symbolic_checkpoint()) { + for (int64_t i = 0; i < buffer_->size(); ++i) { + checkpoint_indices_.insert(i); + } + } return OkStatus(); } @@ -229,6 +237,8 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase { this->RecordBufferDequeue(ctx, *out_tensors); std::swap(buffer_->at(index), buffer_->at(slices_.front()->start % buffer_->size())); + checkpoint_indices_.insert(index); + checkpoint_indices_.insert(slices_.front()->start % buffer_->size()); slices_.front()->start++; num_elements_--; return OkStatus(); @@ -273,8 +283,20 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase { TF_RETURN_IF_ERROR(writer->WriteScalar(prefix(), kEpoch, epoch_)); TF_RETURN_IF_ERROR( writer->WriteScalar(prefix(), kNumElements, num_elements_)); - TF_RETURN_IF_ERROR(WriteElementsToCheckpoint( - writer, absl::StrCat(prefix(), kColon, "buffer"), *buffer_)); + const std::string key_prefix = absl::StrCat(prefix(), kColon, "buffer"); + if (ctx->symbolic_checkpoint()) { + // When symbolic checkpointing is turned on, `writer` + // already contains checkpoint of the shuffle buffer created by the + // previous invocation of this instance and the indices that need to be + // updated are stored in `checkpoint_indices`. + TF_RETURN_IF_ERROR(UpdateCheckpointElements( + writer, key_prefix, *buffer_, checkpoint_indices_)); + checkpoint_indices_.clear(); + } else { + TF_RETURN_IF_ERROR( + WriteElementsToCheckpoint(writer, key_prefix, *buffer_)); + } + TF_RETURN_IF_ERROR( writer->WriteScalar(prefix(), kSlicesSize, slices_.size())); for (size_t i = 0; i < slices_.size(); ++i) { @@ -339,6 +361,12 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase { TF_RETURN_IF_ERROR(ReadElementsFromCheckpoint( ctx, reader, absl::StrCat(prefix(), kColon, "buffer"), buffer_.get())); + if (ctx->symbolic_checkpoint()) { + DCHECK(checkpoint_indices_.empty()); + for (size_t i = 0; i < buffer_->size(); ++i) { + checkpoint_indices_.insert(i); + } + } for (const auto& element : *buffer_) { RecordBufferEnqueue(ctx, element); } @@ -502,9 +530,11 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase { this->RecordBufferEnqueue(ctx, element); if (num_elements_ == buffer_->size()) { DCHECK(IsShuffleAll()); + checkpoint_indices_.insert(buffer_->size()); buffer_->push_back(element); } else { size_t index = slices_.back()->end % buffer_->size(); + checkpoint_indices_.insert(index); buffer_->at(index) = std::move(element); } num_elements_++; @@ -530,6 +560,10 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase { SeedGenerator* const seed_generator_ TF_GUARDED_BY(mu_); // Not owned. std::unique_ptr>> buffer_ TF_GUARDED_BY(mu_); + // Holds the indices of `buffer_` that have changed since the previous + // `SaveInternal()` and need to be updated in the MemoryCheckpoint + // (if symbolic checkpointing is used) in the next `SaveInternal()`. 
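The shuffle-buffer changes above turn symbolic checkpointing into a delta scheme: rather than re-serializing the whole buffer on every SaveInternal, only the slots recorded in checkpoint_indices_ since the previous save are rewritten, and the set is cleared afterwards. A minimal standalone sketch of that dirty-slot idea; UpdateCheckpointElements here is a simplified stand-in for the real serialization helper, not its actual signature:

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Stand-in for the real UpdateCheckpointElements(): rewrite only dirty slots.
void UpdateCheckpointElements(std::vector<std::string>& saved,
                              const std::vector<std::string>& buffer,
                              const std::unordered_set<size_t>& dirty) {
  saved.resize(buffer.size());
  for (size_t i : dirty) saved[i] = buffer[i];
}

int main() {
  std::vector<std::string> buffer = {"a", "b", "c", "d"};
  std::vector<std::string> saved = buffer;        // state after the last save
  std::unordered_set<size_t> checkpoint_indices;  // dirty slots since then

  buffer[2] = "x";                // element swapped into slot 2
  checkpoint_indices.insert(2);   // record the slot, not the whole buffer

  UpdateCheckpointElements(saved, buffer, checkpoint_indices);
  checkpoint_indices.clear();     // next save starts from a clean delta

  std::cout << saved[2] << "\n";  // "x": only one slot was rewritten
}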
+ absl::flat_hash_set checkpoint_indices_ TF_GUARDED_BY(mu_); std::unique_ptr input_impl_ TF_GUARDED_BY(mu_) = nullptr; int64_t epoch_ TF_GUARDED_BY(mu_) = 0; int64_t num_elements_ TF_GUARDED_BY(mu_) = 0; diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index 0708e70481c594..c15b014815705c 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -58,16 +58,14 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; bool UseCudnnWith16BitFloat(OpKernelContext* ctx, DataType dtype) { -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM if (dtype == DT_HALF) { return true; } else if (dtype == DT_BFLOAT16) { auto* stream = ctx->op_device_context()->stream(); - if (!stream) return false; - return stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + return IsBF16SupportedInOps(stream); } -#endif +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM return false; } diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc index e360d5f5a7d653..009710f113fd7a 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cc @@ -25,6 +25,7 @@ limitations under the License. #endif // GOOGLE_CUDA #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/gpu_utils.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/stream_executor_util.h" #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -1045,11 +1046,9 @@ struct FusedBatchNorm { Tensor* batch_mean, Tensor* batch_var, Tensor* saved_mean, Tensor* saved_inv_var, TensorFormat tensor_format, bool use_reserved_space) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = context->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); + if (cast_to_float) { Tensor casted_x = x; Tensor casted_side_input; @@ -1311,11 +1310,8 @@ struct FusedBatchNormGrad { Tensor* x_backprop, Tensor* scale_backprop, Tensor* offset_backprop, Tensor* side_input_backprop, bool use_reserved_space, TensorFormat tensor_format) { - // Performant bfloat16 operations are supported for Ampere+ GPUs. For - // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. auto* stream = context->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_y_backprop = y_backprop; Tensor casted_x = x; diff --git a/tensorflow/core/kernels/fused_eigen_output_kernels.h b/tensorflow/core/kernels/fused_eigen_output_kernels.h index c264925055286f..9a50882a1016c7 100644 --- a/tensorflow/core/kernels/fused_eigen_output_kernels.h +++ b/tensorflow/core/kernels/fused_eigen_output_kernels.h @@ -26,6 +26,8 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ #define TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_ +#include + #include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" @@ -103,6 +105,22 @@ struct Relu6 { }; }; +// Applies `Tanh` to the passed input expression. +struct Tanh { + template + static auto apply(XprType expr) -> decltype(expr.tanh()) { + return expr.tanh(); + }; +}; + +// Applies `Sigmoid` to the passed input expression. +struct Sigmoid { + template + static auto apply(XprType expr) -> decltype(expr.sigmoid()) { + return expr.sigmoid(); + }; +}; + // Applies `Elu` to the passed input expression. struct Elu { template @@ -142,6 +160,8 @@ struct BiasAddArgs { return fusion == FusedComputationType::kBiasAdd || fusion == FusedComputationType::kBiasAddWithRelu || fusion == FusedComputationType::kBiasAddWithRelu6 || + fusion == FusedComputationType::kBiasAddWithTanh || + fusion == FusedComputationType::kBiasAddWithSigmoid || fusion == FusedComputationType::kBiasAddWithElu || fusion == FusedComputationType::kBiasAddWithLeakyRelu; } @@ -219,10 +239,16 @@ struct BiasAddOutputKernel { typename TTypes::UnalignedConstTensor bias(bias_base, num_rows); for (int col = 0; col < num_cols; ++col) { - T* output_base = &output_mapper(0, col); - typename TTypes::UnalignedTensor output(output_base, num_rows); - const auto expr = output + bias; - output = Activation::template apply(expr); + Scalar* output_base = &output_mapper(0, col); + typename TTypes::UnalignedTensor output(output_base, num_rows); + if constexpr (std::is_same_v) { + const auto expr = output + bias; + output = Activation::template apply(expr); + } else { + const auto bias_expr = bias.template cast(); + const auto expr = output + bias_expr; + output = Activation::template apply(expr); + } } } @@ -246,10 +272,18 @@ struct BiasAddOutputKernel { typename TTypes::UnalignedConstTensor bias(bias_base, num_rows); for (int col = 0; col < num_cols; ++col) { - T* output_base = &output_mapper(0, col); - typename TTypes::UnalignedTensor output(output_base, num_rows); - const auto expr = output + bias; - output = LeakyRelu::template apply(expr, leakyrelu_alpha); + Scalar* output_base = &output_mapper(0, col); + typename TTypes::UnalignedTensor output(output_base, num_rows); + if constexpr (std::is_same_v) { + const auto expr = output + bias; + output = + LeakyRelu::template apply(expr, leakyrelu_alpha); + } else { + const auto bias_expr = bias.template cast(); + const auto expr = output + bias_expr; + output = + LeakyRelu::template apply(expr, leakyrelu_alpha); + } } } @@ -356,6 +390,10 @@ using WithBiasAddAndRelu = BiasAddOutputKernel; template using WithBiasAddAndRelu6 = BiasAddOutputKernel; template +using WithBiasAddAndTanh = BiasAddOutputKernel; +template +using WithBiasAddAndSigmoid = BiasAddOutputKernel; +template using WithBiasAddAndElu = BiasAddOutputKernel; template using WithBiasAddAndLeakyRelu = BiasAddOutputKernel; diff --git a/tensorflow/core/kernels/gpu_utils.cc b/tensorflow/core/kernels/gpu_utils.cc index f9b9868579a4af..6f578f5f7d124d 100644 --- a/tensorflow/core/kernels/gpu_utils.cc +++ b/tensorflow/core/kernels/gpu_utils.cc @@ -37,6 +37,21 @@ using xla::AutotuningLog; using xla::ComputeCapability; using xla::CudnnVersion; +bool IsBF16SupportedInOps(se::Stream* stream) { + if (!stream) { + return false; // No stream: don't know whether it's supported. 
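The new Tanh and Sigmoid functors above plug into the same BiasAddOutputKernel scheme as Relu and Relu6: the kernel forms a column-wise "output + bias" Eigen expression and then applies Activation::apply to it (casting the float bias to the output scalar type first when the output is half). A small self-contained Eigen sketch of that add-bias-then-activate expression shape, without the contraction output-mapper plumbing (Sigmoid is used the same way as Tanh):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

// Activation functors in the style of the ones added above.
struct Tanh {
  template <typename Xpr>
  static auto apply(Xpr expr) -> decltype(expr.tanh()) { return expr.tanh(); }
};
struct Sigmoid {
  template <typename Xpr>
  static auto apply(Xpr expr) -> decltype(expr.sigmoid()) { return expr.sigmoid(); }
};

int main() {
  Eigen::Tensor<float, 2> out(3, 2);  // one block of matmul output
  out.setConstant(0.5f);
  Eigen::Tensor<float, 1> bias(3);
  bias.setConstant(1.0f);

  // Column-wise bias add followed by the activation, as one Eigen expression.
  for (int col = 0; col < out.dimension(1); ++col) {
    out.chip(col, 1) = Tanh::apply(out.chip(col, 1) + bias);
  }
  std::cout << out(0, 0) << "\n";  // tanh(1.5) ~= 0.905
}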
+ } +#if GOOGLE_CUDA + // Performant bfloat16 operations are supported for Ampere+ GPUs. For + // pre-Ampere GPUs, we cast inputs to float and outputs back to bfloat16. + return stream->GetCudaComputeCapability().IsAtLeast( + se::CudaComputeCapability::AMPERE); +#elif TENSORFLOW_USE_ROCM + // So far, we return false meaning that the conversion to float is needed. + return false; +#endif +} + bool RedzoneCheckDisabled() { const char* disable_rz_str = std::getenv("TF_DISABLE_RZ_CHECK"); return disable_rz_str != nullptr && std::strcmp(disable_rz_str, "1") == 0; diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h index 96af46697a859b..8d511859ac5768 100644 --- a/tensorflow/core/kernels/gpu_utils.h +++ b/tensorflow/core/kernels/gpu_utils.h @@ -42,6 +42,10 @@ class AutotuneResult; namespace tensorflow { +// Returns true if bfloat16 is directly supported in Ops and inputs shall not be +// casted to floats to perform the computations and then back. +bool IsBF16SupportedInOps(se::Stream* stream); + class NodeDef; using xla::AutotuneResult; diff --git a/tensorflow/core/kernels/linalg/einsum_op_impl.h b/tensorflow/core/kernels/linalg/einsum_op_impl.h index 99c13063933250..da5c6718f4a271 100644 --- a/tensorflow/core/kernels/linalg/einsum_op_impl.h +++ b/tensorflow/core/kernels/linalg/einsum_op_impl.h @@ -471,6 +471,7 @@ struct EinsumHelper { ReshapeToRank3(*output, bcast.output_batch_size(), &output_reshaped)); LaunchBatchMatMul::Launch(ctx, lhs, rhs, /*adj_x=*/false, /*adj_y=*/false, trans_x, trans_y, + /*grad_x=*/false, /*grad_y=*/false, bcast, &output_reshaped); return OkStatus(); } diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc index 872aa9247bcb51..f937a06016dc8c 100644 --- a/tensorflow/core/kernels/matmul_op_fused.cc +++ b/tensorflow/core/kernels/matmul_op_fused.cc @@ -32,15 +32,20 @@ limitations under the License. #define EIGEN_USE_GPU #endif // GOOGLE_CUDA +#include #include +#include #include #include +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/kernels/fused_eigen_output_kernels.h" #include "tensorflow/core/platform/errors.h" @@ -85,14 +90,16 @@ struct LaunchFusedMatMulOp { template struct LaunchFusedMatMulOp { + // Use F32 compute for F16 inputs on CPU to preserve precision and reduce + // excessive casting during intermediate computations. 
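IsBF16SupportedInOps centralizes the device check that FusedBatchNorm, DnnPooling and the depthwise conv previously made inline: on CUDA it requires Ampere or newer, on ROCm it currently reports false, and callers negate the result to decide whether to cast bfloat16 tensors through float. A simplified standalone sketch of the caller pattern; FakeDevice and its fields are placeholders for the StreamExecutor stream query, not real API:

#include <iostream>

// Placeholder for the device query normally answered by StreamExecutor.
struct FakeDevice {
  int cuda_major = 0;   // 8 corresponds to Ampere
  bool is_rocm = false;
};

bool IsBF16SupportedInOps(const FakeDevice* device) {
  if (device == nullptr) return false;  // no stream: assume unsupported
  if (device->is_rocm) return false;    // ROCm: always cast to float for now
  return device->cuda_major >= 8;       // Ampere and newer run bf16 natively
}

int main() {
  FakeDevice turing;
  turing.cuda_major = 7;
  FakeDevice ampere;
  ampere.cuda_major = 8;

  // Callers such as FusedBatchNorm and DnnPooling use the negation as a
  // "cast inputs to float, cast outputs back to bfloat16" flag.
  std::cout << std::boolalpha << !IsBF16SupportedInOps(&turing) << " "  // true
            << !IsBF16SupportedInOps(&ampere) << "\n";                  // false
}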
+ using ComputeType = + std::conditional_t::value == DT_HALF, float, T>; + void operator()( OpKernelContext* context, const Tensor& a, const Tensor& b, const Eigen::array, 1>& dim_pair, FusedComputationType fusion, const FusedComputationArgs& fusion_args, Tensor* output, bool use_autotune) { - OP_REQUIRES(context, DataTypeToEnum::value != DT_HALF, - errors::InvalidArgument("_FusedMatMul doesn't support DT_HALF " - "data type on CPU devices.")); auto lhs = a.matrix(); auto rhs = b.matrix(); auto out = output->matrix(); @@ -104,13 +111,21 @@ struct LaunchFusedMatMulOp { auto executeWithOutputKernel = [&](auto output_kernel) { OutputKernelWrapper output_kernel_wrapper( [&output_kernel]( - const ContractionOutputMapper& output_mapper, + const ContractionOutputMapper& + output_mapper, const Eigen::TensorContractionParams& params, Eigen::Index i, Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) { output_kernel(output_mapper, params, i, j, num_rows, num_cols); }); - out.device(d) = lhs.contract(rhs, dim_pair, output_kernel_wrapper); + if constexpr (std::is_same_v) { + out.device(d) = lhs.contract(rhs, dim_pair, output_kernel_wrapper); + } else { + out.device(d) = lhs.template cast() + .contract(rhs.template cast(), + dim_pair, output_kernel_wrapper) + .template cast(); + } }; BiasAddArgs bias_add_args; @@ -133,6 +148,12 @@ struct LaunchFusedMatMulOp { case FusedComputationType::kBiasAddWithRelu6: executeWithOutputKernel(WithBiasAddAndRelu6(bias_add_args)); break; + case FusedComputationType::kBiasAddWithTanh: + executeWithOutputKernel(WithBiasAddAndTanh(bias_add_args)); + break; + case FusedComputationType::kBiasAddWithSigmoid: + executeWithOutputKernel(WithBiasAddAndSigmoid(bias_add_args)); + break; case FusedComputationType::kBiasAddWithElu: executeWithOutputKernel(WithBiasAddAndElu(bias_add_args)); break; @@ -155,16 +176,16 @@ struct LaunchFusedMatMulOp { // We do not pass std::function directly as an output kernel because it blows // up the binary size in debug mode with super long symbol names. 
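With the changes above, the CPU _FusedMatMul kernel accepts DT_HALF by contracting in float: when T is Eigen::half, both operands are cast to ComputeType (float), the contraction and output kernel run in float, and the result is cast back to half. A tiny standalone Eigen sketch of that cast-contract-cast round trip (no output kernel, illustrative only):

#include <iostream>
#include <type_traits>
#include <unsupported/Eigen/CXX11/Tensor>

template <typename T>
using ComputeType = std::conditional_t<std::is_same_v<T, Eigen::half>, float, T>;

template <typename T>
Eigen::Tensor<T, 2> MatMul(const Eigen::Tensor<T, 2>& a,
                           const Eigen::Tensor<T, 2>& b) {
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  if constexpr (std::is_same_v<T, ComputeType<T>>) {
    return a.contract(b, dims);  // contract natively
  } else {
    // Cast to float, contract in float, cast back: mirrors the half path.
    Eigen::Tensor<ComputeType<T>, 2> out =
        a.template cast<ComputeType<T>>().contract(
            b.template cast<ComputeType<T>>(), dims);
    return out.template cast<T>();
  }
}

int main() {
  Eigen::Tensor<Eigen::half, 2> a(2, 3), b(3, 2);
  a.setConstant(Eigen::half(1.0f));
  b.setConstant(Eigen::half(0.5f));
  Eigen::Tensor<Eigen::half, 2> c = MatMul(a, b);
  std::cout << static_cast<float>(c(0, 0)) << "\n";  // 1.5
}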
struct OutputKernelWrapper { - using OutputKernelFn = - std::function&, - const Eigen::TensorContractionParams&, Eigen::Index, - Eigen::Index, Eigen::Index, Eigen::Index)>; + using OutputKernelFn = std::function&, + const Eigen::TensorContractionParams&, Eigen::Index, Eigen::Index, + Eigen::Index, Eigen::Index)>; explicit OutputKernelWrapper(OutputKernelFn fn) : output_kernel_fn(std::move(fn)) {} void operator()( - const ContractionOutputMapper& output_mapper, + const ContractionOutputMapper& output_mapper, const Eigen::TensorContractionParams& params, Eigen::Index i, Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) const { output_kernel_fn(output_mapper, params, i, j, num_rows, num_cols); @@ -611,6 +632,8 @@ class FusedMatMulOp : public OpKernel { {FCT::kBiasAdd, {"BiasAdd"}}, {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}, {FCT::kBiasAddWithRelu6, {"BiasAdd", "Relu6"}}, + {FCT::kBiasAddWithTanh, {"BiasAdd", "Tanh"}}, + {FCT::kBiasAddWithSigmoid, {"BiasAdd", "Sigmoid"}}, {FCT::kBiasAddWithElu, {"BiasAdd", "Elu"}}, {FCT::kBiasAddWithLeakyRelu, {"BiasAdd", "LeakyRelu"}}, }; @@ -711,6 +734,7 @@ class FusedMatMulOp : public OpKernel { FusedMatMulOp); TF_CALL_float(REGISTER_FUSED_CPU_MATMUL); +TF_CALL_half(REGISTER_FUSED_CPU_MATMUL); #undef REGISTER_FUSED_CPU_MATMUL diff --git a/tensorflow/core/kernels/matmul_op_impl.h b/tensorflow/core/kernels/matmul_op_impl.h index 71f338f266bf55..7180fb1d4e35f9 100644 --- a/tensorflow/core/kernels/matmul_op_impl.h +++ b/tensorflow/core/kernels/matmul_op_impl.h @@ -21,12 +21,15 @@ limitations under the License. #define EIGEN_USE_THREADS #include +#include #include #include #include #include +#include "Eigen/Core" // from @eigen_archive #include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/bfloat16.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -36,6 +39,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/bfloat16.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/matmul_autotune.h" @@ -410,7 +414,8 @@ template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, - bool trans_y, const MatMulBCast& bcast, Tensor* out) { + bool trans_y, bool grad_x, bool grad_y, + const MatMulBCast& bcast, Tensor* out) { typedef ParallelMatMulKernel::IsComplex> ParallelMatMulKernel; bool conjugate_result = false; @@ -539,7 +544,8 @@ template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, - bool trans_y, const MatMulBCast& bcast, Tensor* out) { + bool trans_y, bool grad_x, bool grad_y, + const MatMulBCast& bcast, Tensor* out) { se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, se::blas::Transpose::kTranspose, se::blas::Transpose::kConjugateTranspose}; @@ -582,6 +588,16 @@ struct LaunchBatchMatMul { std::is_same_v; using Coefficient = std::conditional_t; + se::blas::CallContext call_context = se::blas::CallContext::kNone; + OP_REQUIRES(context, grad_x == false || grad_y == false, + errors::InvalidArgument( + "At least 1 of grad_x and grad_y shall be false")); + if (grad_x) { + call_context = se::blas::CallContext::kBackpropInput1; + } + if (grad_y) { + call_context = se::blas::CallContext::kBackpropInput2; + } #if GOOGLE_CUDA || TF_HIPBLASLT static const bool use_autotune = MatmulAutotuneEnable(); bool bCublasLtSupport = true; @@ -711,8 +727,7 @@ struct LaunchBatchMatMul { static_cast(1.0), b_ptrs, adj_y || trans_y ? k : n, a_ptrs, adj_x || trans_x ? m : k, static_cast(0.0), c_ptrs, n, batch_size, - GetNumericOptions(), &scratch_allocator, - se::blas::CallContext::kNone) + GetNumericOptions(), &scratch_allocator, call_context) .ok(); if (!blas_launch_status) { context->SetStatus(errors::Internal( @@ -811,17 +826,16 @@ struct LaunchBatchMatMul { blas_transpose_b, blas_transpose_a, n, m, k, *(b_ptrs[0]), adj_y || trans_y ? k : n, *(a_ptrs[0]), adj_x || trans_x ? m : k, c_ptrs[0], n, - GetNumericOptions(), se::blas::CallContext::kNone)); + GetNumericOptions(), call_context)); } else if (use_strided_batched) { OP_REQUIRES_OK( - context, - stream->ThenBlasGemmStridedBatched( - blas_transpose_b, blas_transpose_a, n, m, k, - static_cast(1.0), *b_ptrs[0], - adj_y || trans_y ? k : n, b_stride, *a_ptrs[0], - adj_x || trans_x ? m : k, a_stride, - static_cast(0.0), c_ptrs[0], n, c_stride, - batch_size, GetNumericOptions(), se::blas::CallContext::kNone)); + context, stream->ThenBlasGemmStridedBatched( + blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), *b_ptrs[0], + adj_y || trans_y ? k : n, b_stride, *a_ptrs[0], + adj_x || trans_x ? m : k, a_stride, + static_cast(0.0), c_ptrs[0], n, c_stride, + batch_size, GetNumericOptions(), call_context)); } else { BlasScratchAllocator scratch_allocator(context); bool blas_launch_status = @@ -831,8 +845,7 @@ struct LaunchBatchMatMul { static_cast(1.0), b_ptrs, adj_y || trans_y ? k : n, a_ptrs, adj_x || trans_x ? 
m : k, static_cast(0.0), c_ptrs, n, batch_size, - GetNumericOptions(), &scratch_allocator, - se::blas::CallContext::kNone) + GetNumericOptions(), &scratch_allocator, call_context) .ok(); if (!blas_launch_status) { context->SetStatus(errors::Internal( @@ -850,6 +863,32 @@ struct LaunchBatchMatMul { #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +inline void FastConvertToFloat(const T* src, float* dst, int64_t size) { + Eigen::Map> src_eigen(src, size); + Eigen::Map dst_eigen(dst, size); + dst_eigen = src_eigen.template cast(); +} + +template +inline void FastConvertFromFloat(const float* src, T* dst, int64_t size) { + Eigen::Map src_eigen(src, size); + Eigen::Map> dst_eigen(dst, size); + dst_eigen = src_eigen.template cast(); +} + +template <> +inline void FastConvertToFloat(const bfloat16* src, float* dst, + int64_t size) { + BFloat16ToFloat(src, dst, size); +} + +template <> +inline void FastConvertFromFloat(const float* src, bfloat16* dst, + int64_t size) { + FloatToBFloat16(src, dst, size); +} + template class BaseBatchMatMulOp : public OpKernel { public: @@ -862,11 +901,15 @@ class BaseBatchMatMulOp : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("transpose_b", &trans_y_)); adj_x_ = false; adj_y_ = false; + OP_REQUIRES_OK(context, context->GetAttr("grad_a", &grad_input_1_)); + OP_REQUIRES_OK(context, context->GetAttr("grad_b", &grad_input_2_)); } else { OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_)); OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_)); trans_x_ = false; trans_y_ = false; + OP_REQUIRES_OK(context, context->GetAttr("grad_x", &grad_input_1_)); + OP_REQUIRES_OK(context, context->GetAttr("grad_y", &grad_input_2_)); } } @@ -931,8 +974,17 @@ class BaseBatchMatMulOp : public OpKernel { out_reshaped.CopyFrom(*out, TensorShape({batch_size, d0, d3})), errors::Internal("Failed to reshape output from ", out->shape().DebugString())); - if (std::is_same_v && std::is_same_v && - std::is_same_v) { + + // b/307285203: There seems to be an overly aggressive compiler optimization + // that optimizes away these data pointers unless we explicitly check them. + OP_REQUIRES(ctx, + in0_reshaped.data() != nullptr && + in1_reshaped.data() != nullptr && + out_reshaped.data() != nullptr, + absl::InternalError("Null data pointer encountered.")); + if constexpr (std::is_same_v && std::is_same_v && + (std::is_same_v || + std::is_same_v)) { Tensor in0_reshaped_float, in1_reshaped_float, out_reshaped_float; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, in0_reshaped.shape(), &in0_reshaped_float)); @@ -941,31 +993,32 @@ class BaseBatchMatMulOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, out_reshaped.shape(), &out_reshaped_float)); - // TODO: Avoid extra copy to make bfloat16 matmul efficient on CPU. - BFloat16ToFloat(in0_reshaped.flat().data(), - in0_reshaped_float.flat().data(), - in0_reshaped.NumElements()); - BFloat16ToFloat(in1_reshaped.flat().data(), - in1_reshaped_float.flat().data(), - in1_reshaped.NumElements()); + // TODO: Avoid extra copy to make (b)float16 matmul efficient on CPU. 
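FastConvertToFloat and FastConvertFromFloat above are mapped Eigen casts over contiguous memory, with bfloat16 specializations that defer to the existing vectorized BFloat16ToFloat/FloatToBFloat16 helpers. A standalone sketch of the generic (Eigen::half) path, the bfloat16 specializations omitted:

#include <cstdint>
#include <iostream>
#include <vector>
#include <Eigen/Core>

template <typename T>
void FastConvertToFloat(const T* src, float* dst, int64_t size) {
  Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>> src_map(src, size);
  Eigen::Map<Eigen::ArrayXf> dst_map(dst, size);
  dst_map = src_map.template cast<float>();
}

template <typename T>
void FastConvertFromFloat(const float* src, T* dst, int64_t size) {
  Eigen::Map<const Eigen::ArrayXf> src_map(src, size);
  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> dst_map(dst, size);
  dst_map = src_map.template cast<T>();
}

int main() {
  std::vector<Eigen::half> h = {Eigen::half(1.5f), Eigen::half(-2.0f)};
  std::vector<float> f(h.size());
  FastConvertToFloat(h.data(), f.data(), h.size());

  std::vector<Eigen::half> back(h.size());
  FastConvertFromFloat(f.data(), back.data(), back.size());
  std::cout << f[0] << " " << static_cast<float>(back[1]) << "\n";  // 1.5 -2
}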
+ FastConvertToFloat(in0_reshaped.flat().data(), + in0_reshaped_float.flat().data(), + in0_reshaped.NumElements()); + FastConvertToFloat(in1_reshaped.flat().data(), + in1_reshaped_float.flat().data(), + in1_reshaped.NumElements()); LaunchBatchMatMul::Launch( ctx, in0_reshaped_float, in1_reshaped_float, adj_x_, adj_y_, trans_x_, - trans_y_, bcast, &out_reshaped_float); - FloatToBFloat16(out_reshaped_float.flat().data(), - out_reshaped.flat().data(), out->NumElements()); + trans_y_, grad_input_1_, grad_input_2_, bcast, &out_reshaped_float); + FastConvertFromFloat(out_reshaped_float.flat().data(), + out_reshaped.flat().data(), + out->NumElements()); } else { // Cast tensor to desired type to reuse Eigen. // TODO(b/178749687): remove this cast if Eigen supports this natively. - if (!std::is_same::value) { + if constexpr (!std::is_same::value) { in0_reshaped = CastTensor(in0_reshaped); } - if (!std::is_same::value) { + if constexpr (!std::is_same::value) { in1_reshaped = CastTensor(in1_reshaped); } - LaunchBatchMatMul::Launch(ctx, in0_reshaped, in1_reshaped, - adj_x_, adj_y_, trans_x_, - trans_y_, bcast, &out_reshaped); + LaunchBatchMatMul::Launch( + ctx, in0_reshaped, in1_reshaped, adj_x_, adj_y_, trans_x_, trans_y_, + grad_input_1_, grad_input_2_, bcast, &out_reshaped); } } @@ -979,6 +1032,8 @@ class BaseBatchMatMulOp : public OpKernel { bool adj_y_ = false; bool trans_x_ = false; bool trans_y_ = false; + bool grad_input_1_ = false; + bool grad_input_2_ = false; // Cast `t` from `SrcT` to `DstT`. template diff --git a/tensorflow/core/kernels/matmul_op_test.cc b/tensorflow/core/kernels/matmul_op_test.cc index 9d4276c39c2d10..96c37ac97817b8 100644 --- a/tensorflow/core/kernels/matmul_op_test.cc +++ b/tensorflow/core/kernels/matmul_op_test.cc @@ -416,12 +416,7 @@ REGISTER_TYPED_TEST_SUITE_P(FusedMatMulWithBiasOpTest, // MatMul1x256x1WithActivation); // TODO(ezhulenev): Add support for more data types. -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM using FusedBiasAddDataTypes = ::testing::Types; -#else -// CPU doesn't support more data types. 
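The MKL change above maps the "BiasAdd + _FusedHardSwish" pattern onto oneDNN's eltwise_hardswish with alpha = 1/6 and beta = 0.5, i.e. hard_swish(x) = x * max(0, min(1, x/6 + 0.5)), which is the familiar x * relu6(x + 3) / 6 written in oneDNN's alpha/beta form. A quick standalone check that the two formulations agree:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>

// oneDNN-style hard swish: x * clamp(alpha * x + beta, 0, 1) with alpha=1/6, beta=0.5.
float HardSwishDnnl(float x) {
  return x * std::clamp(x / 6.0f + 0.5f, 0.0f, 1.0f);
}

// The common relu6 formulation: x * relu6(x + 3) / 6.
float HardSwishRelu6(float x) {
  return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
}

int main() {
  for (float x : {-4.0f, -1.0f, 0.0f, 2.5f, 5.0f}) {
    assert(std::fabs(HardSwishDnnl(x) - HardSwishRelu6(x)) < 1e-6f);
  }
  std::cout << HardSwishDnnl(2.5f) << "\n";  // 2.5 * (2.5/6 + 0.5) ~= 2.2917
}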
-using FusedBiasAddDataTypes = ::testing::Types; -#endif INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedMatMulWithBiasOpTest, FusedBiasAddDataTypes); diff --git a/tensorflow/core/kernels/matmul_util.cc b/tensorflow/core/kernels/matmul_util.cc index b91db8d4cd3273..930de6e25ed604 100644 --- a/tensorflow/core/kernels/matmul_util.cc +++ b/tensorflow/core/kernels/matmul_util.cc @@ -166,6 +166,8 @@ StatusOr GetPlanAndAlgorithms( .beta = 0.0, .compute_precision = se::blas::kDefaultComputePrecision, .algorithm = {}, + .grad_x = false, + .grad_y = false, .compute_type = computation_type, }; diff --git a/tensorflow/core/kernels/mkl/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl/mkl_conv_ops.cc index 84486cc1abb9d3..ead6367f5bc3ce 100644 --- a/tensorflow/core/kernels/mkl/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl/mkl_conv_ops.cc @@ -1660,6 +1660,10 @@ class MklFusedConvOp OP_REQUIRES(context, num_args == 1, absl::InvalidArgumentError( "Fused Conv2D must have one extra argument: bias.")); + } else if (fused_ops == std::vector{"BiasAdd", "_FusedHardSwish"}) { + this->set_fuse_biasadd(true); + this->set_fuse_activation(true, dnnl::algorithm::eltwise_hardswish, + 1.0 / 6.0, 0.5); } else if (fused_ops == std::vector{"BiasAdd", "Add"}) { this->set_fuse_biasadd(true); this->set_fuse_add(true); @@ -1831,6 +1835,10 @@ class MklFusedDepthwiseConvOp } else if (fused_ops == std::vector{"BiasAdd", "Elu"}) { this->set_fuse_biasadd(true); this->set_fuse_activation(true, dnnl::algorithm::eltwise_elu, 1.0); + } else if (fused_ops == std::vector{"BiasAdd", "_FusedHardSwish"}) { + this->set_fuse_biasadd(true); + this->set_fuse_activation(true, dnnl::algorithm::eltwise_hardswish, + 1.0 / 6.0, 0.5); } else { OP_REQUIRES(context, false, absl::InvalidArgumentError( diff --git a/tensorflow/core/kernels/mlir_generated/BUILD b/tensorflow/core/kernels/mlir_generated/BUILD index db697ffe26f6b2..cb43658770bc9d 100644 --- a/tensorflow/core/kernels/mlir_generated/BUILD +++ b/tensorflow/core/kernels/mlir_generated/BUILD @@ -527,7 +527,6 @@ tf_cuda_cc_test( shard_count = 20, tags = tf_cuda_tests_tags() + [ "no_cuda_asan", # b/173033461 - "no_rocm", # failed since 7de9cf4 ], deps = [ ":base_binary_ops_test", diff --git a/tensorflow/core/kernels/mlir_generated/gpu_binary_ops_test.cc b/tensorflow/core/kernels/mlir_generated/gpu_binary_ops_test.cc index 9ea628a8673c94..561ca57c67ca11 100644 --- a/tensorflow/core/kernels/mlir_generated/gpu_binary_ops_test.cc +++ b/tensorflow/core/kernels/mlir_generated/gpu_binary_ops_test.cc @@ -696,11 +696,13 @@ GENERATE_DEFAULT_TESTS_WITH_SPECIFIC_INPUT_VALUES( /*test_name=*/UInt64, uint64_t, uint64_t, test::DefaultInput(), test::DefaultInputNonZero(), baseline_floor_mod, test::OpsTestConfig().ExpectStrictlyEqual()); +#if !TENSORFLOW_USE_ROCM GENERATE_DEFAULT_TESTS_WITH_SPECIFIC_INPUT_VALUES( FloorMod, /*test_name=*/Half, Eigen::half, Eigen::half, test::DefaultInput(), test::DefaultInputNonZero(), baseline_floor_mod, test::OpsTestConfig()); +#endif GENERATE_DEFAULT_TESTS_WITH_SPECIFIC_INPUT_VALUES( FloorMod, /*test_name=*/Float, float, float, test::DefaultInput(), diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index 407d6991608c7e..8d1220de4d4e0e 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -462,8 +462,7 @@ void DnnPoolingOp::Compute( context->allocate_output(0, tensor_out_shape, &tensor_out)); auto* stream = context->op_device_context()->stream(); - const bool 
cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_tensor_in; Tensor casted_tensor_out; @@ -876,8 +875,7 @@ void DnnPoolingGradOp::Compute( OP_REQUIRES_OK(context, context->allocate_output(0, tensor_in_shape, &input_backprop)); auto* stream = context->op_device_context()->stream(); - const bool cast_to_float = !stream->GetCudaComputeCapability().IsAtLeast( - se::CudaComputeCapability::AMPERE); + const bool cast_to_float = !IsBF16SupportedInOps(stream); if (cast_to_float) { Tensor casted_tensor_in; Tensor casted_tensor_out; diff --git a/tensorflow/core/kernels/ragged_cross_op.cc b/tensorflow/core/kernels/ragged_cross_op.cc index 71deb58c3c12d0..c8f27051b449cc 100644 --- a/tensorflow/core/kernels/ragged_cross_op.cc +++ b/tensorflow/core/kernels/ragged_cross_op.cc @@ -22,10 +22,12 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/ragged_utils.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" #include "tensorflow/core/util/util.h" #include "tensorflow/core/util/work_sharder.h" +#include "tsl/platform/errors.h" namespace tensorflow { @@ -392,28 +394,10 @@ class RaggedCrossOp : public OpKernel { return absl::InvalidArgumentError( "tf.ragged.cross only supports inputs with rank=2."); } - if (ragged_splits_list[i].NumElements() == 0) { - return absl::InvalidArgumentError( - "Invalid RaggedTensor: Ragged splits must be non-empty."); - } - auto flat_row_splits = ragged_splits_list[i].flat(); - if (flat_row_splits(0) != 0) { - return absl::InvalidArgumentError( - "Invalid RaggedTensor: Ragged splits must start from 0."); - } + int64_t num_values = ragged_values_list[i].NumElements(); - if (flat_row_splits(flat_row_splits.size() - 1) != num_values) { - return absl::InvalidArgumentError( - "Invalid RaggedTensor: " - "Ragged splits must end with the number of values."); - } - for (int i = 1; i < flat_row_splits.size(); ++i) { - if (flat_row_splits(i - 1) > flat_row_splits(i)) { - return absl::InvalidArgumentError( - "Invalid RaggedTensor: " - "Ragged splits must be sorted in ascending order."); - } - } + TF_RETURN_IF_ERROR(RaggedTensorVerifySplits( + ragged_splits_list[i], true, num_values)); } for (int i = 0; i < num_sparse; ++i) { if (!TensorShapeUtils::IsMatrix(sparse_indices_list[i].shape()) || diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc index f9d45627bc109c..153fd5a98fea1e 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/kernels/ragged_tensor_variant.h" +#include "tensorflow/core/kernels/ragged_utils.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/errors.h" @@ -191,34 +192,6 @@ class RaggedTensorToVariantOp : public OpKernel { // Validate nested_row_splits. 
for (int i = ragged_nested_splits_len - 1; i >= 0; --i) { - OP_REQUIRES(context, ragged_nested_splits_in[i].dims() == 1, - errors::InvalidArgument("Requires nested_row_splits[", i, "]", - " to be rank 1 but is rank ", - ragged_nested_splits_in[i].dims())); - OP_REQUIRES( - context, ragged_nested_splits_in[i].dim_size(0) >= 1, - errors::InvalidArgument("Requires nested_row_splits[", i, "]", - " has at least one splits, but is empty.")); - OP_REQUIRES(context, - ragged_nested_splits_in[i].flat()(0) == - static_cast(0), - errors::InvalidArgument( - "Requires the first element of nested_row_splits[", i, - "]", " to be 0 but is ", - ragged_nested_splits_in[i].flat()(0))); - - SPLIT_TYPE last_split = 0; - for (int j = 1; j < ragged_nested_splits_in[i].dim_size(0); j++) { - auto split = ragged_nested_splits_in[i].flat()(j); - OP_REQUIRES( - context, split >= last_split, - errors::InvalidArgument("Requires splits to be monotonically " - "increasing, but nested_row_splits[", - i, "][", j, "]=", split, - " is smaller than nested_row_splits[", i, - "][", j - 1, "]=", last_split)); - last_split = split; - } SPLIT_TYPE nvals; if (i == ragged_nested_splits_len - 1) { OP_REQUIRES(context, batched_ragged_input.values().dims() >= 1, @@ -230,12 +203,8 @@ class RaggedTensorToVariantOp : public OpKernel { nvals = ragged_nested_splits_in[i + 1].dim_size(0) - 1; } - OP_REQUIRES(context, last_split == nvals, - errors::InvalidArgument("Requires nested_row_splits[", i, - "][-1]=", last_split, - " to be equal with the number of " - "values in this dimension, which is ", - nvals, ".")); + OP_REQUIRES_OK(context, RaggedTensorVerifySplits( + ragged_nested_splits_in[i], true, nvals)); } for (int i = 0; i < ragged_nested_splits_len; i++) { @@ -290,6 +259,15 @@ class RaggedTensorToVariantGradientOp : public OpKernel { TensorShapeUtils::MakeShape(context->input(2).vec(), &dense_values_shape)); + // Validate row_splits. + // Note rank of the row_splits can be 0. Besides, the number of ragged + // values corresponding to the outermost splits are unknown when calculating + // the gradient so we don't check the last element of `row_splits` + if (row_splits.dims()) { + OP_REQUIRES_OK( + context, RaggedTensorVerifySplits(row_splits, false, 0)); + } + const auto& flat_variants = encoded_variant.flat(); // Get a Tensor containing the flat_values for each variant. 
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc index adfe17667315c8..ac04580f4dec11 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc @@ -390,8 +390,8 @@ TEST_F(RaggedTensorToVariantKernelTest, true); EXPECT_THAT(RunOpKernel(), testing::StatusIs(error::INVALID_ARGUMENT, - "Requires the first element of " - "nested_row_splits[0] to be 0 but is 1")); + "Invalid ragged splits: first element of " + "ragged splits must be 0 but is 1")); } TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsIncreasingError) { @@ -400,9 +400,10 @@ TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsIncreasingError) { true); EXPECT_THAT(RunOpKernel(), testing::StatusIs(error::INVALID_ARGUMENT, - "Requires splits to be monotonically " - "increasing, but nested_row_splits[0][2]=-1 is " - "smaller than nested_row_splits[0][1]=2")); + "Invalid ragged splits: ragged splits must be " + "monotonically increasing, but " + "ragged_splits[2]=-1 is smaller than " + "row_splits[1]=2")); } TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsSizeMismatchError) { @@ -412,8 +413,8 @@ TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsSizeMismatchError) { EXPECT_THAT( RunOpKernel(), testing::StatusIs(error::INVALID_ARGUMENT, - "Requires nested_row_splits[0][-1]=3 to be equal with " - "the number of values in this dimension, which is 5.")); + "Invalid ragged splits: last element of ragged splits " + "must be the number of ragged values(5) but is 3")); } TEST_F(RaggedTensorToVariantKernelTest, @@ -425,8 +426,8 @@ TEST_F(RaggedTensorToVariantKernelTest, EXPECT_THAT( RunOpKernel(), testing::StatusIs(error::INVALID_ARGUMENT, - "Requires nested_row_splits[1][-1]=4 to be equal with " - "the number of values in this dimension, which is 5.")); + "Invalid ragged splits: last element of ragged splits " + "must be the number of ragged values(5) but is 4")); } TEST_F(RaggedTensorToVariantKernelTest, @@ -438,8 +439,8 @@ TEST_F(RaggedTensorToVariantKernelTest, EXPECT_THAT( RunOpKernel(), testing::StatusIs(error::INVALID_ARGUMENT, - "Requires nested_row_splits[0][-1]=2 to be equal with " - "the number of values in this dimension, which is 3.")); + "Invalid ragged splits: last element of ragged splits " + "must be the number of ragged values(3) but is 2")); } TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsEmptySplitsError) { @@ -448,8 +449,8 @@ TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsEmptySplitsError) { {0, 1, 2, 3, 4}, true); EXPECT_THAT(RunOpKernel(), testing::StatusIs(error::INVALID_ARGUMENT, - "Requires nested_row_splits[0] has at least " - "one splits, but is empty.")); + "Invalid ragged splits: ragged splits must " + "have at least one splits, but is empty")); } TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsScalarValueError) { @@ -462,5 +463,83 @@ TEST_F(RaggedTensorToVariantKernelTest, NestedRowSplitsScalarValueError) { "nested_row_splits is not empty, but is 0.")); } +TEST_F(RaggedTensorToVariantGradientKernelTest, RowSplitsMatch) { + // encoded_variant_grad= + // [ [1, 2, 3], + // [ ], + // [4, 5 ], + // [6 ]] + auto encoded_variant_grad_1 = + CreateVariantFromRagged({}, {3}, {1, 2, 3}); + auto encoded_variant_grad_2 = + CreateVariantFromRagged({}, {0}, {}); + auto encoded_variant_grad_3 = + CreateVariantFromRagged({}, {2}, {4, 5}); + auto encoded_variant_grad_4 = + CreateVariantFromRagged({}, {1}, {6}); + + 
BuildEncodeRaggedTensorGradientGraph( + {encoded_variant_grad_1, encoded_variant_grad_2, encoded_variant_grad_3, + encoded_variant_grad_4}, + {0, 3, 3, 5, 6}, {6}); + + TF_ASSERT_OK(RunOpKernel()); +} + +TEST_F(RaggedTensorToVariantGradientKernelTest, + RowSplitsFirstElementNotZeroError) { + // encoded_variant_grad= + // [ [1, 2, 3], + // [ ], + // [4, 5 ], + // [6 ]] + auto encoded_variant_grad_1 = + CreateVariantFromRagged({}, {3}, {1, 2, 3}); + auto encoded_variant_grad_2 = + CreateVariantFromRagged({}, {0}, {}); + auto encoded_variant_grad_3 = + CreateVariantFromRagged({}, {2}, {4, 5}); + auto encoded_variant_grad_4 = + CreateVariantFromRagged({}, {1}, {6}); + + BuildEncodeRaggedTensorGradientGraph( + {encoded_variant_grad_1, encoded_variant_grad_2, encoded_variant_grad_3, + encoded_variant_grad_4}, + {1, 3, 3, 5, 6}, {6}); + + EXPECT_THAT(RunOpKernel(), + testing::StatusIs(error::INVALID_ARGUMENT, + "Invalid ragged splits: first element of " + "ragged splits must be 0 but is 1")); +} + +TEST_F(RaggedTensorToVariantGradientKernelTest, RowSplitsIncreasingError) { + // encoded_variant_grad= + // [ [1, 2, 3], + // [ ], + // [4, 5 ], + // [6 ]] + auto encoded_variant_grad_1 = + CreateVariantFromRagged({}, {3}, {1, 2, 3}); + auto encoded_variant_grad_2 = + CreateVariantFromRagged({}, {0}, {}); + auto encoded_variant_grad_3 = + CreateVariantFromRagged({}, {2}, {4, 5}); + auto encoded_variant_grad_4 = + CreateVariantFromRagged({}, {1}, {6}); + + BuildEncodeRaggedTensorGradientGraph( + {encoded_variant_grad_1, encoded_variant_grad_2, encoded_variant_grad_3, + encoded_variant_grad_4}, + {0, 3, 2, 5, 6}, {6}); + + EXPECT_THAT(RunOpKernel(), + testing::StatusIs(error::INVALID_ARGUMENT, + "Invalid ragged splits: ragged splits must be " + "monotonically increasing, but " + "ragged_splits[2]=2 is smaller than " + "row_splits[1]=3")); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h index 0b71a308b2c503..7dc63ac8fbf7f8 100644 --- a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h +++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include @@ -129,6 +130,60 @@ class RaggedTensorToVariantKernelTest : public ::tensorflow::OpsTestBase { } }; +class RaggedTensorToVariantGradientKernelTest + : public ::tensorflow::OpsTestBase { + protected: + // Builds the tensorflow test graph for the RaggedTensorToVariantGradient op, + // and populates the `encoded_ragged_grad`, `row_splits` and + // `dense_values_shape` input with the given values. 
+ template + void BuildEncodeRaggedTensorGradientGraph( + const std::vector& encoded_ragged_grad, + const std::vector& row_splits, + const std::vector& dense_values_shape) { + const auto values_dtype = DataTypeToEnum::v(); + const auto splits_dtype = DataTypeToEnum::v(); + + TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedTensorToVariantGradient") + .Input(FakeInput(DT_VARIANT)) // encoded_ragged_grad + .Input(FakeInput(splits_dtype)) // row_splits + .Input(FakeInput(DT_INT32)) // dense_values_shape + .Attr("Tvalues", values_dtype) + .Attr("Tsplits", splits_dtype) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + + int64_t encoded_ragged_grad_size = encoded_ragged_grad.size(); + AddInputFromArray(TensorShape({encoded_ragged_grad_size}), + encoded_ragged_grad); + + int64_t splits_size = row_splits.size(); + AddInputFromArray(TensorShape({splits_size}), row_splits); + + int64_t dense_values_shape_size = dense_values_shape.size(); + AddInputFromArray(TensorShape({dense_values_shape_size}), + dense_values_shape); + } + + template + RaggedTensorVariant CreateVariantFromRagged( + const std::vector>& ragged_splits, + const TensorShape& ragged_values_shape, + const std::vector& ragged_values) { + RaggedTensorVariant encoded; + for (auto ragged_split : ragged_splits) { + int splits_size = ragged_split.size(); + Tensor splits(DataTypeToEnum::v(), + TensorShape({splits_size})); + test::FillValues(&splits, ragged_split); + encoded.append_splits(splits); + } + Tensor values(DataTypeToEnum::v(), ragged_values_shape); + test::FillValues(&values, ragged_values); + encoded.set_values(values); + return encoded; + } +}; } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_RAGGED_TENSOR_TO_VARIANT_OP_TEST_H_ diff --git a/tensorflow/core/kernels/ragged_utils.h b/tensorflow/core/kernels/ragged_utils.h new file mode 100644 index 00000000000000..f91f1da343993f --- /dev/null +++ b/tensorflow/core/kernels/ragged_utils.h @@ -0,0 +1,77 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_RAGGED_UTILS_H_ +#define TENSORFLOW_CORE_KERNELS_RAGGED_UTILS_H_ + +#include + +#include "absl/status/status.h" +#include "tensorflow/core/framework/tensor.h" + +namespace tensorflow { + +// Utility functions for RaggedTensor + +// Verifies that the splits are valid for ragged tensor +template +Status RaggedTensorVerifySplits(const Tensor& ragged_splits, + bool check_last_element, + int64_t num_ragged_values) { + auto flat_ragged_splits = ragged_splits.flat(); + + if (ragged_splits.dims() != 1) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid ragged splits: ragged splits must be rank 1 but is rank ", + ragged_splits.dims())); + } + + if (ragged_splits.NumElements() < 1) { + return absl::InvalidArgumentError( + "Invalid ragged splits: ragged splits must have at least one splits, " + "but is empty"); + } + + if (flat_ragged_splits(0) != static_cast(0)) { + return absl::InvalidArgumentError( + absl::StrCat("Invalid ragged splits: first element of ragged splits " + " must be 0 but is ", + flat_ragged_splits(0))); + } + + SPLIT_TYPE last_split = 0; + for (int j = 1; j < ragged_splits.dim_size(0); j++) { + auto split = flat_ragged_splits(j); + if (split < last_split) { + return absl::InvalidArgumentError( + absl::StrCat("Invalid ragged splits: ragged splits must be " + "monotonically increasing, but ragged_splits[", + j, "]=", split, " is smaller than row_splits[", j - 1, + "]=", last_split)); + } + last_split = split; + } + + if (check_last_element & last_split != num_ragged_values) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid ragged splits: last element of ragged splits must be ", + "the number of ragged values(", num_ragged_values, ") but is ", + last_split)); + } + + return absl::OkStatus(); +} +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_RAGGED_UTILS_H_ diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc index ac5a410a261ed9..ae9169df96ac3a 100644 --- a/tensorflow/core/kernels/roll_op.cc +++ b/tensorflow/core/kernels/roll_op.cc @@ -15,14 +15,19 @@ limitations under the License. 
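RaggedTensorVerifySplits above consolidates the row-splits validation that RaggedCross and RaggedTensorToVariant previously duplicated: the splits tensor must be rank 1, non-empty, start at 0, be monotonically non-decreasing, and (when check_last_element is set) end at the number of ragged values. A simplified stand-in over a plain std::vector, handy for mapping sample inputs to the new error messages (the strings below paraphrase, not reproduce, the kernel's messages):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for RaggedTensorVerifySplits operating on a std::vector.
std::string VerifySplits(const std::vector<int64_t>& splits,
                         bool check_last_element, int64_t num_values) {
  if (splits.empty()) return "ragged splits must have at least one split";
  if (splits.front() != 0) return "first element of ragged splits must be 0";
  for (size_t j = 1; j < splits.size(); ++j) {
    if (splits[j] < splits[j - 1]) {
      return "ragged splits must be monotonically increasing";
    }
  }
  if (check_last_element && splits.back() != num_values) {
    return "last element of ragged splits must be the number of ragged values";
  }
  return "ok";
}

int main() {
  std::cout << VerifySplits({0, 3, 3, 5}, true, 5) << "\n";  // ok
  std::cout << VerifySplits({1, 3, 5}, true, 5) << "\n";     // first element...
  std::cout << VerifySplits({0, 3, 2, 5}, true, 5) << "\n";  // monotonically...
  std::cout << VerifySplits({0, 3}, true, 5) << "\n";        // last element...
}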
#include "tensorflow/core/kernels/roll_op.h" +#include +#include + #include "tensorflow/core/framework/bounds_check.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/register_types_traits.h" -#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/work_sharder.h" @@ -191,9 +196,9 @@ void DoRollWithMemcpy(const OpKernelContext* context, int64_t start, int64_t end) { // the number of indices over in the flattened tensor you need to skip in // order to make it over from one side of the isd to the other - const int64_t isd_range = std::max(dim_range[isd], 1); - // the distance along the flattend tensor to the next element in the isd - const int64_t isd_stride = isd_range / std::max(dim_size[isd], 1); + const int64_t isd_range = std::max(dim_range[isd], 1); + // the distance along the flattened tensor to the next element in the isd + const int64_t isd_stride = isd_range / std::max(dim_size[isd], 1); // start and end represent the i-th group currently so we will convert // them into numbers representing the i-th elements. @@ -295,9 +300,10 @@ void DoRollWithMemcpy(const OpKernelContext* context, // Shard auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); const int64_t ave_group_size = dim_range[isd] / 2; - const int total_work = 2 * num_elements / std::max(dim_range[isd], 1); + const int64_t total_work = + 2 * num_elements / std::max(dim_range[isd], 1); // 25000 - experimentally determined with float and bool types - const int cost_per_group = 25000 * sizeof(T) * ave_group_size; + const int64_t cost_per_group = 25000 * sizeof(T) * ave_group_size; Shard(worker_threads->num_threads, worker_threads->workers, total_work, cost_per_group, std::move(work)); } diff --git a/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc b/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc index d031461318df3e..eb58830bff17d2 100644 --- a/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc +++ b/tensorflow/core/kernels/tensor_to_hash_bucket_op.cc @@ -74,7 +74,7 @@ TF_CALL_INTEGRAL_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER(Name("_TensorToHashBucketFast") \ @@ -86,6 +86,6 @@ TF_CALL_INTEGRAL_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/tensor_to_hash_bucket_op.h b/tensorflow/core/kernels/tensor_to_hash_bucket_op.h index 6c75b8cffccd10..cdf7dab23947d9 100644 --- a/tensorflow/core/kernels/tensor_to_hash_bucket_op.h +++ b/tensorflow/core/kernels/tensor_to_hash_bucket_op.h @@ -66,13 +66,13 @@ struct LaunchTensorToHashBucket { } }; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template struct LaunchTensorToHashBucket { void operator()(OpKernelContext* c, const int64_t num_buckets, const T* input, const int num_elems, 
int64_t* output); }; -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/tensor_to_hash_bucket_op_gpu.cu.cc b/tensorflow/core/kernels/tensor_to_hash_bucket_op_gpu.cu.cc index 8e79c7929f013c..8b6b0d48ecc461 100644 --- a/tensorflow/core/kernels/tensor_to_hash_bucket_op_gpu.cu.cc +++ b/tensorflow/core/kernels/tensor_to_hash_bucket_op_gpu.cu.cc @@ -10,7 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive @@ -119,4 +119,4 @@ TF_CALL_INTEGRAL_TYPES(REGISTER_FUNCTORS); #undef REGISTER_FUNCTORS } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu.h b/tensorflow/core/kernels/topk_op_gpu.h index 152dbcd96a0e63..26162abc2f8f80 100644 --- a/tensorflow/core/kernels/topk_op_gpu.h +++ b/tensorflow/core/kernels/topk_op_gpu.h @@ -483,25 +483,16 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, bool ran_nonsegmented_version = false; if (num_rows == 1) { -#if GOOGLE_CUDA - constexpr bool is_supported = true; -#else - // GpuRadixSortDescending is not supported on ROCm for fp16/bf16. - constexpr bool is_supported = !std::is_same::value && - !std::is_same::value; -#endif - if constexpr (is_supported) { - // Note: DeviceSegmentedRadixSort is very slow when num_segments=1 because - // it only uses 1 SM per segment. Calling the un-segmented version is much - // faster in this case. - TF_RETURN_IF_ERROR( - GpuRadixSortDescending(ctx, num_cols, /*keys_in=*/input, - /*keys_out=*/sorted_values_ptr, - /*indices_in=*/input_indices_t.data(), - /*indices_out=*/sorted_indices_ptr, - /*num_bits=*/sizeof(T) * 8)); - ran_nonsegmented_version = true; - } + // Note: DeviceSegmentedRadixSort is very slow when num_segments=1 because + // it only uses 1 SM per segment. Calling the un-segmented version is much + // faster in this case. + TF_RETURN_IF_ERROR( + GpuRadixSortDescending(ctx, num_cols, /*keys_in=*/input, + /*keys_out=*/sorted_values_ptr, + /*indices_in=*/input_indices_t.data(), + /*indices_out=*/sorted_indices_ptr, + /*num_bits=*/sizeof(T) * 8)); + ran_nonsegmented_version = true; } if (!ran_nonsegmented_version) { auto err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending( diff --git a/tensorflow/core/lib/png/BUILD b/tensorflow/core/lib/png/BUILD index 46f167b5e60302..cdd3491276c9a3 100644 --- a/tensorflow/core/lib/png/BUILD +++ b/tensorflow/core/lib/png/BUILD @@ -15,6 +15,7 @@ cc_library( name = "png_io", srcs = ["png_io.cc"], hdrs = ["png_io.h"], + features = ["-layering_check"], deps = [ "//tensorflow/core/platform:byte_order", "//tensorflow/core/platform:logging", @@ -22,6 +23,7 @@ cc_library( "//tensorflow/core/platform:stringpiece", "//tensorflow/core/platform:types", "@com_google_absl//absl/base", + "@png", "@zlib", ], ) diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc index 2bfbe4470d94a8..f07861a0be6808 100644 --- a/tensorflow/core/lib/png/png_io.cc +++ b/tensorflow/core/lib/png/png_io.cc @@ -25,6 +25,7 @@ limitations under the License. // provokes a compile error. We instead let png.h include what is needed. 
#include "absl/base/casts.h" +#include "png.h" // from @png #include "tensorflow/core/lib/png/png_io.h" #include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/logging.h" @@ -77,7 +78,7 @@ static void Convert8to16(const uint8* p8, int num_comps, int p8_row_bytes, void ErrorHandler(png_structp png_ptr, png_const_charp msg) { DecodeContext* const ctx = - absl::bit_cast(png_get_io_ptr(png_ptr)); + absl::bit_cast(png_get_error_ptr(png_ptr)); ctx->error_condition = true; // To prevent log spam, errors are logged as VLOG(1) instead of ERROR. VLOG(1) << "PNG error: " << msg; @@ -354,8 +355,9 @@ bool WriteImageToBuffer( png_string->resize(0); png_infop info_ptr = nullptr; - png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, nullptr, - ErrorHandler, WarningHandler); + DecodeContext decode_context; + png_structp png_ptr = png_create_write_struct( + PNG_LIBPNG_VER_STRING, &decode_context, ErrorHandler, WarningHandler); if (png_ptr == nullptr) return false; if (setjmp(png_jmpbuf(png_ptr))) { png_destroy_write_struct(&png_ptr, info_ptr ? &info_ptr : nullptr); diff --git a/tensorflow/core/ops/compat/ops_history_v2/BatchMatMul.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BatchMatMul.pbtxt index 747bd5ed03115c..9d7ac3ca8e2a33 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/BatchMatMul.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/BatchMatMul.pbtxt @@ -174,3 +174,62 @@ op { } } } +op { + name: "BatchMatMul" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + attr { + name: "adj_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "adj_y" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_y" + type: "bool" + default_value { + b: false + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV2.pbtxt index aa446c0a492155..4769d8220f53e1 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV2.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV2.pbtxt @@ -139,3 +139,67 @@ op { } } } +op { + name: "BatchMatMulV2" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_UINT32 + type: DT_UINT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + attr { + name: "adj_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "adj_y" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_y" + type: "bool" + default_value { + b: false + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV3.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV3.pbtxt index 332af934efbb23..1bcfdb937064ca 100644 --- 
a/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV3.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/BatchMatMulV3.pbtxt @@ -164,3 +164,101 @@ op { } } } +op { + name: "BatchMatMulV3" + input_arg { + name: "x" + type_attr: "Ta" + } + input_arg { + name: "y" + type_attr: "Tb" + } + output_arg { + name: "output" + type_attr: "Tout" + } + attr { + name: "Ta" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_UINT8 + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + attr { + name: "Tb" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_UINT8 + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + attr { + name: "Tout" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + attr { + name: "adj_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "adj_y" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_y" + type: "bool" + default_value { + b: false + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/GlobalIterId.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/GlobalIterId.pbtxt new file mode 100644 index 00000000000000..5fa2302622c9ac --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/GlobalIterId.pbtxt @@ -0,0 +1,8 @@ +op { + name: "GlobalIterId" + output_arg { + name: "iter_id" + type: DT_INT64 + } + is_stateful: true +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/ListSnapshotChunksDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ListSnapshotChunksDataset.pbtxt new file mode 100644 index 00000000000000..be35470141fb28 --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/ListSnapshotChunksDataset.pbtxt @@ -0,0 +1,44 @@ +op { + name: "ListSnapshotChunksDataset" + input_arg { + name: "snapshot_path" + type: DT_STRING + } + output_arg { + name: "handle" + type: DT_VARIANT + experimental_full_type { + type_id: TFT_DATASET + args { + type_id: TFT_FOR_EACH + args { + type_id: TFT_PRODUCT + } + args { + type_id: TFT_TENSOR + args { + type_id: TFT_VAR + s: "output_types" + } + } + args { + type_id: TFT_VAR + s: "output_types" + } + } + } + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + is_stateful: true +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/MatMul.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/MatMul.pbtxt index 369f763e9472c5..8f79fa11000f7f 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/MatMul.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/MatMul.pbtxt @@ -223,3 +223,66 @@ op { } } } +op { + name: "MatMul" + input_arg { + name: "a" + type_attr: "T" + } + input_arg { + name: "b" + type_attr: "T" + } + output_arg { + name: "product" + type_attr: "T" + } + attr { + name: "transpose_a" + type: "bool" + default_value { + b: false + } + } + attr { + name: "transpose_b" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: 
DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + type: DT_UINT32 + type: DT_UINT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } + attr { + name: "grad_a" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_b" + type: "bool" + default_value { + b: false + } + } +} diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc index 1203d36a8951c0..396e1720aaf2fd 100644 --- a/tensorflow/core/ops/experimental_dataset_ops.cc +++ b/tensorflow/core/ops/experimental_dataset_ops.cc @@ -1252,6 +1252,21 @@ REGISTER_OP("SnapshotNestedDatasetReader") "output_types")) .SetShapeFn(shape_inference::ScalarShape); +REGISTER_OP("ListSnapshotChunksDataset") + .Input("snapshot_path: string") + .Output("handle: variant") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .SetIsStateful() + .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET, + "output_types")) + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // `snapshot_path` should be a scalar. + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + return shape_inference::ScalarShape(c); + }); + REGISTER_OP("SqlDataset") .Input("driver_name: string") .Input("data_source_name: string") diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index a5356ab9d38911..d54750253f32e3 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -126,6 +126,8 @@ REGISTER_OP("BatchMatMul") "complex128}") .Attr("adj_x: bool = false") .Attr("adj_y: bool = false") + .Attr("grad_x: bool = false") + .Attr("grad_y: bool = false") .SetShapeFn(shape_inference::BatchMatMulShape); REGISTER_OP("BatchMatMulV2") @@ -137,6 +139,8 @@ REGISTER_OP("BatchMatMulV2") "uint16, uint32, uint64, complex64, complex128}") .Attr("adj_x: bool = false") .Attr("adj_y: bool = false") + .Attr("grad_x: bool = false") + .Attr("grad_y: bool = false") .SetShapeFn(shape_inference::BatchMatMulV2Shape); REGISTER_OP("BatchMatMulV3") @@ -154,6 +158,8 @@ REGISTER_OP("BatchMatMulV3") "complex128}") .Attr("adj_x: bool = false") .Attr("adj_y: bool = false") + .Attr("grad_x: bool = false") + .Attr("grad_y: bool = false") .SetShapeFn(shape_inference::BatchMatMulV2Shape); #ifdef INTEL_MKL @@ -164,6 +170,8 @@ REGISTER_OP("_MklBatchMatMul") .Attr("T: {bfloat16, float}") .Attr("adj_x: bool = false") .Attr("adj_y: bool = false") + .Attr("grad_x: bool = false") + .Attr("grad_y: bool = false") .SetShapeFn(shape_inference::BatchMatMulShape); REGISTER_OP("_MklBatchMatMulV2") @@ -173,6 +181,8 @@ REGISTER_OP("_MklBatchMatMulV2") .Attr("T: {bfloat16, float}") .Attr("adj_x: bool = false") .Attr("adj_y: bool = false") + .Attr("grad_x: bool = false") + .Attr("grad_y: bool = false") .SetShapeFn(shape_inference::BatchMatMulV2Shape); #endif // INTEL_MKL @@ -953,6 +963,8 @@ REGISTER_OP("MatMul") .Attr( "T: {bfloat16, half, float, double, int32, int64, uint8, " "uint16, uint32, uint64, complex64, complex128}") + .Attr("grad_a: bool = false") + .Attr("grad_b: bool = false") .SetShapeFn(shape_inference::MatMulShape); #ifdef INTEL_MKL @@ -963,6 +975,8 @@ REGISTER_OP("_MklMatMul") .Attr("transpose_a: bool = false") .Attr("transpose_b: bool = false") .Attr("T: {bfloat16, float}") + .Attr("grad_a: bool = false") + .Attr("grad_b: bool = false") .SetShapeFn(shape_inference::MatMulShape); #endif // INTEL_MKL diff --git 
a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 2bcdc8b109329d..95a20f13f87522 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -4204,6 +4204,20 @@ op { b: false } } + attr { + name: "grad_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_y" + type: "bool" + default_value { + b: false + } + } } op { name: "BatchMatMulV2" @@ -4254,6 +4268,20 @@ op { b: false } } + attr { + name: "grad_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_y" + type: "bool" + default_value { + b: false + } + } } op { name: "BatchMatMulV3" @@ -4338,6 +4366,20 @@ op { b: false } } + attr { + name: "grad_x" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_y" + type: "bool" + default_value { + b: false + } + } } op { name: "BatchMatrixBandPart" @@ -22034,6 +22076,14 @@ op { } is_stateful: true } +op { + name: "GlobalIterId" + output_arg { + name: "iter_id" + type: DT_INT64 + } + is_stateful: true +} op { name: "Greater" input_arg { @@ -25210,6 +25260,50 @@ op { } } } +op { + name: "ListSnapshotChunksDataset" + input_arg { + name: "snapshot_path" + type: DT_STRING + } + output_arg { + name: "handle" + type: DT_VARIANT + experimental_full_type { + type_id: TFT_DATASET + args { + type_id: TFT_FOR_EACH + args { + type_id: TFT_PRODUCT + } + args { + type_id: TFT_TENSOR + args { + type_id: TFT_VAR + s: "output_types" + } + } + args { + type_id: TFT_VAR + s: "output_types" + } + } + } + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + is_stateful: true +} op { name: "LoadAllTPUEmbeddingParameters" input_arg { @@ -27092,6 +27186,20 @@ op { } } } + attr { + name: "grad_a" + type: "bool" + default_value { + b: false + } + } + attr { + name: "grad_b" + type: "bool" + default_value { + b: false + } + } } op { name: "MatchingFiles" diff --git a/tensorflow/core/platform/build_config.default.bzl b/tensorflow/core/platform/build_config.default.bzl index 24421c6d6e8b87..80c7d25ad1dd9e 100644 --- a/tensorflow/core/platform/build_config.default.bzl +++ b/tensorflow/core/platform/build_config.default.bzl @@ -1,24 +1,23 @@ """OSS versions of Bazel macros that can't be migrated to TSL.""" +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load( + "@local_xla//xla:xla.bzl", + _xla_clean_dep = "clean_dep", +) load( "//tensorflow/core/platform:build_config_root.bzl", "if_static", ) load( - "@local_xla//xla:xla.bzl", - _xla_clean_dep = "clean_dep", + "//third_party/mkl:build_defs.bzl", + "if_mkl_ml", ) load( "@local_tsl//tsl:tsl.bzl", "if_libtpu", _tsl_clean_dep = "clean_dep", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") -load( - "//third_party/mkl:build_defs.bzl", - "if_mkl_ml", -) def tf_tpu_dependencies(): return if_libtpu(["//tensorflow/core/tpu/kernels"]) @@ -34,11 +33,7 @@ def tf_additional_binary_deps(): # core. 
str(Label("//tensorflow/core/kernels:lookup_util")), str(Label("//tensorflow/core/util/tensor_bundle")), - ] + if_cuda( - [ - str(Label("@local_xla//xla/stream_executor:cuda_platform")), - ], - ) + if_rocm( + ] + if_rocm( [ str(Label("@local_xla//xla/stream_executor:rocm_platform")), str(Label("@local_xla//xla/stream_executor/rocm:rocm_rpath")), diff --git a/tensorflow/core/platform/host_info.h b/tensorflow/core/platform/host_info.h index 89d495d6e41229..caab7ae380b31b 100644 --- a/tensorflow/core/platform/host_info.h +++ b/tensorflow/core/platform/host_info.h @@ -22,6 +22,7 @@ limitations under the License. namespace tensorflow { namespace port { using tsl::port::Hostname; +using tsl::port::IOStatistics; using tsl::port::JobName; using tsl::port::JobUid; } // namespace port diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 3590422f92d5b5..27d9d649c84cb3 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -1015,6 +1015,7 @@ cc_library( srcs = ["xspace_to_dcn_slack_analysis.cc"], hdrs = ["xspace_to_dcn_slack_analysis.h"], deps = [ + "//tensorflow/core/profiler/protobuf:dcn_collective_info_proto_cc", "//tensorflow/core/profiler/protobuf:dcn_slack_analysis_proto_cc", "//tensorflow/core/profiler/protobuf:topology_proto_cc", "//tensorflow/core/profiler/utils:hlo_module_utils", diff --git a/tensorflow/core/profiler/convert/op_profile_builder.cc b/tensorflow/core/profiler/convert/op_profile_builder.cc index 2111ea4f56ac6e..124d4096518f95 100644 --- a/tensorflow/core/profiler/convert/op_profile_builder.cc +++ b/tensorflow/core/profiler/convert/op_profile_builder.cc @@ -185,9 +185,12 @@ void PopulateOpMetricsNode( // https://github.com/tensorflow/profiler/blob/master/frontend/app/common/utils/utils.ts metrics->set_raw_time(op_metrics.time_ps()); metrics->set_raw_flops(op_metrics.flops()); + metrics->set_occurrences(op_metrics.occurrences()); + metrics->set_avg_time_ps( + SafeDivide(op_metrics.time_ps(), op_metrics.occurrences())); // Hack to approximate utilization for INT8/4 convolution HLOs: - // Since MXU BW is 2x/4x for INT8/4, multiply peak BW by the factor detemrined + // Since MXU BW is 2x/4x for INT8/4, multiply peak BW by the factor determined // by the computation size if (GetComputationSize(*node) == 8) { peak_gigaflops_per_second_per_core *= 2; diff --git a/tensorflow/core/profiler/convert/process_megascale_dcn.cc b/tensorflow/core/profiler/convert/process_megascale_dcn.cc index dc740ffd9090c0..947c5e54a19568 100644 --- a/tensorflow/core/profiler/convert/process_megascale_dcn.cc +++ b/tensorflow/core/profiler/convert/process_megascale_dcn.cc @@ -47,6 +47,8 @@ void ProcessMegascaleDcn(XSpace* space) { for (XPlane* device_xplane : device_xplanes) { dcn_events_processor.AddTpuCollectiveDcnTrafficToXPlane(device_xplane); } + + SortXSpace(space); } } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/xplane_to_trace_container.cc b/tensorflow/core/profiler/convert/xplane_to_trace_container.cc index 53f94371d1a96d..cfb4f2ec20cf4b 100644 --- a/tensorflow/core/profiler/convert/xplane_to_trace_container.cc +++ b/tensorflow/core/profiler/convert/xplane_to_trace_container.cc @@ -207,6 +207,7 @@ void ConvertXPlaneToTraceEventsContainer(uint64_t device_id, } plane.ForEachLine([&](const XLineVisitor& line) { + if (line.DisplayName() == tsl::profiler::kXlaAsyncOpLineName) return; if (line.NumEvents() == 0) return; // Capture a copy of XLineVisitor because it will 
go out of scope. uint32_t device_id = resource_grouper->GetDeviceId(line.DisplayId()); @@ -241,7 +242,7 @@ void ConvertXSpaceToTraceEventsContainer(absl::string_view hostname, for (const XPlane* custom_plane : FindPlanesWithPrefix(space, tsl::profiler::kCustomPlanePrefix)) { ConvertXPlaneToTraceEventsContainer( - tsl::profiler::kCustomPlaneDeviceId + custom_plane->id(), hostname, + tsl::profiler::kFirstCustomPlaneDeviceId + custom_plane->id(), hostname, *custom_plane, container); } } diff --git a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc index 95b6342c525133..f85d0f92904c80 100644 --- a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc +++ b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.cc @@ -33,6 +33,7 @@ limitations under the License. #include "xla/shape_util.h" #include "xla/side_effect_util.h" #include "xla/xla_data.pb.h" +#include "tensorflow/core/profiler/protobuf/dcn_collective_info.pb.h" #include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h" #include "tensorflow/core/profiler/protobuf/topology.pb.h" #include "tensorflow/core/profiler/utils/hlo_module_utils.h" @@ -74,7 +75,7 @@ using xla::HloOpcode; // TODO: Identify mechanism to maintain consistency between producer and // consumer here. const char kHostEventRegex[] = { - "device_[0-9][0-9][0-9]([0-9][0-9][0-9])_gid_(.*)"}; + "device_[0-9]+([0-9][0-9][0-9][0-9][0-9])_gid_(.*)"}; std::optional GetAttributeFromInstr( const xla::HloInstruction* instr, std::string_view attribute) { @@ -110,6 +111,21 @@ std::string HostCollectiveKey(int index_on_host, return absl::StrCat(index_on_host, "_", rendezvous_name); } +DcnCollectiveInfoProto GetDcnCollectiveInfoProto(const XEventVisitor& xevent) { + DcnCollectiveInfoProto dcn_collective_info; + xevent.Metadata().ForEachStat([&](const XStatVisitor& xstat) { + if (static_cast(*xstat.Type()) == StatType::kDcnCollectiveInfo) { + absl::string_view byte_value = xstat.BytesValue(); + if (!dcn_collective_info.ParseFromArray(byte_value.data(), + byte_value.size())) { + LOG(WARNING) << "Could not parse DcnCollectiveInfoProto from metadata."; + } + } + }); + + return dcn_collective_info; +} + } // namespace namespace dcn_analysis_internal { @@ -203,6 +219,55 @@ void DcnTracker::UpdateActiveOps(uint64_t duration) { } } +int DcnTracker::GetReplicaGroupSize(const std::string& rendezvous_name, + const XEventVisitor& visitor) { + if (rendezvous_to_replica_group_size_map_.contains(rendezvous_name)) { + return rendezvous_to_replica_group_size_map_[rendezvous_name]; + } + + DcnCollectiveInfoProto dcn_collective_info = + GetDcnCollectiveInfoProto(visitor); + + if (dcn_collective_info.one_to_one_groups_size() != 0) { + // OneToOneGroup has a source and a destination, which is one replica group + rendezvous_to_replica_group_size_map_[rendezvous_name] = 1; + } else if (dcn_collective_info.endpoint_groups_size() != 0) { + rendezvous_to_replica_group_size_map_[rendezvous_name] = + dcn_collective_info.endpoint_groups(0).endpoints().size(); + } else { + rendezvous_to_replica_group_size_map_[rendezvous_name] = 0; + } + + return rendezvous_to_replica_group_size_map_[rendezvous_name]; +} + +uint64_t DcnTracker::ComputeTransmittedDataSize( + const int64_t buffer_size, const int group_size, + const std::string& transfer_type) { + uint64_t transmitted_bytes = 0; + if (group_size == 0) { + LOG(ERROR) << "Replica group size is 0."; + return transmitted_bytes; + } + + if (transfer_type == 
"ONE_TO_ONE") { + transmitted_bytes = group_size * buffer_size; + } else if (transfer_type == "ALL_GATHER") { + transmitted_bytes = (group_size - 1) * buffer_size; + } else if (transfer_type == "ALL_REDUCE") { + // Since the reduced buffer now has to be sent back to the replicas, + // the total bytes transmitted over the network is 2x the shape of the op. + transmitted_bytes = + 2 * SafeDivide(group_size - 1, group_size) * buffer_size; + } else if (transfer_type == "ALL_TO_ALL" || + transfer_type == "REDUCE_SCATTER") { + transmitted_bytes = SafeDivide(group_size - 1, group_size) * buffer_size; + } else { + LOG(ERROR) << "Unsupported transfer type: " << transfer_type; + } + return transmitted_bytes; +} + void DcnTracker::VisitOp(const InstrMetadata& instr, const XEventVisitor& visitor) { std::string rendezvous_name; @@ -233,6 +298,8 @@ void DcnTracker::VisitOp(const InstrMetadata& instr, opState.send_op_name = visitor.DisplayName(); opState.send.set_duration_ps(visitor.DurationPs()); opState.send.set_start_time_ps(visitor.TimestampPs()); + opState.replica_group_size = + GetReplicaGroupSize(rendezvous_name, visitor); break; case HloOpcode::kRecv: opState.recv.set_duration_ps(visitor.DurationPs()); @@ -255,16 +322,8 @@ void DcnTracker::VisitOp(const InstrMetadata& instr, analysis->set_slack_us(NanoToMicro(visitor.TimestampNs() - opState.start_time - opState.overlapping_duration)); - // TODO(b/294584919): The current transmitted bytes measures the - // buffer size at the recv-done. This could include bytes that were not - // received over the network. Fix the calculation based on the number of - // replica groups. - // In case of ALL_REDUCE, Since the reduced buffer now - // has to be sent back to the replicas, the total bytes transmitted over - // the network is 2x the shape of the op. - analysis->set_bytes_transmitted_over_network( - analysis->transfer_type() == "ALL_REDUCE" ? 2 * instr.size - : instr.size); + analysis->set_bytes_transmitted_over_network(ComputeTransmittedDataSize( + instr.size, opState.replica_group_size, opState.transfer_type)); analysis->set_stall_duration_us(NanoToMicro(opState.stall_duration_ns)); analysis->set_recv_op_name(std::string(visitor.DisplayName())); analysis->set_send_op_name(opState.send_op_name); diff --git a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h index 3f38da65460346..daac70f634abca 100644 --- a/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h +++ b/tensorflow/core/profiler/convert/xspace_to_dcn_slack_analysis.h @@ -28,6 +28,7 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/hlo/ir/hlo_opcode.h" +#include "tensorflow/core/profiler/protobuf/dcn_collective_info.pb.h" #include "tensorflow/core/profiler/protobuf/dcn_slack_analysis.pb.h" #include "tensorflow/core/profiler/protobuf/topology.pb.h" #include "tensorflow/core/profiler/utils/hlo_proto_map.h" @@ -53,6 +54,7 @@ struct DcnOpState { std::string transfer_type; uint64_t stall_duration_ns = 0; std::string send_op_name; + int replica_group_size = 0; OpInstance send; OpInstance send_done; @@ -125,6 +127,7 @@ class DcnTracker { absl::flat_hash_map global_chip_id_to_local_index_map_; absl::flat_hash_map> hlo_module_cache_; + absl::flat_hash_map rendezvous_to_replica_group_size_map_; bool is_megacore_ = true; absl::StatusOr GetInstrMetadataFromHloModule( @@ -140,6 +143,14 @@ class DcnTracker { // GetLocalIndex when available, else return the global_device_id itself. int GetLocalIndex(int dcn_device_id); + + // Get number of replica group + int GetReplicaGroupSize(const std::string& rendezvous_name, + const tsl::profiler::XEventVisitor& visitor); + + // Compute data transmitted size based on number of replica groups + uint64_t ComputeTransmittedDataSize(int64_t buffer_size, int group_size, + const std::string& transfer_type); }; } // namespace dcn_analysis_internal diff --git a/tensorflow/core/profiler/protobuf/BUILD b/tensorflow/core/profiler/protobuf/BUILD index 0e0ffa72b6ce71..c521c9ad89d392 100644 --- a/tensorflow/core/profiler/protobuf/BUILD +++ b/tensorflow/core/profiler/protobuf/BUILD @@ -270,3 +270,10 @@ tf_proto_library( cc_api_version = 2, visibility = [":friends"], ) + +tf_proto_library( + name = "dcn_collective_info_proto", + srcs = ["dcn_collective_info.proto"], + cc_api_version = 2, + visibility = [":friends"], +) diff --git a/tensorflow/core/profiler/protobuf/dcn_collective_info.proto b/tensorflow/core/profiler/protobuf/dcn_collective_info.proto new file mode 100644 index 00000000000000..5359a3dd54c1c6 --- /dev/null +++ b/tensorflow/core/profiler/protobuf/dcn_collective_info.proto @@ -0,0 +1,55 @@ +syntax = "proto3"; + +package tensorflow.profiler; + +// This proto is based on MegaScaleInfoProto and should be consistent with it. +message DcnCollectiveInfoProto { + enum TransferType { + UNKNOWN_TRANSFER_TYPE = 0; + + // XLA AllToAll transfer. + // Needs `endpoint_groups`. + ALL_TO_ALL = 1; + + // Peer-To-Peer DCN transfer from source to one destination. + // Needs one_to_one_groups. + ONE_TO_ONE = 2; + + // XLA reduce-scatter transfer. + // Needs `endpoint_groups`. + REDUCE_SCATTER = 3; + + // XLA AllGather transfer. + // Needs `endpoint_groups`. + ALL_GATHER = 4; + + // XLA all-reduce transfer. + // Needs `endpoint_groups`. + ALL_REDUCE = 5; + } + + message Endpoint { + int32 slice_id = 1; + int32 device_id = 2; + } + + message EndpointGroup { + repeated Endpoint endpoints = 1; + } + + message OneToOneGroup { + Endpoint source = 1; + Endpoint destination = 2; + } + + // The type of DCN transfer. + TransferType transfer_type = 1; + + // Groups of endpoints (in the form of slice id and device id) involved in + // `ALL_TO_ALL`, `REDUCE_SCATTER`, `ALL_REDUCE` and `ALL_GATHER` transfer. + repeated EndpointGroup endpoint_groups = 2; + + // Groups of endpoints (in the form of slice id and device id) involved in + // `ONE_TO_ONE` transfer. 
+ repeated OneToOneGroup one_to_one_groups = 3; +} diff --git a/tensorflow/core/profiler/protobuf/op_profile.proto b/tensorflow/core/profiler/protobuf/op_profile.proto index 9c29d1777eb38a..14ce2d203fb16a 100644 --- a/tensorflow/core/profiler/protobuf/op_profile.proto +++ b/tensorflow/core/profiler/protobuf/op_profile.proto @@ -82,6 +82,10 @@ message Metrics { // Total bytes accessed for each memory type. // Index into array using MemBwType enum. repeated double raw_bytes_accessed_array = 15; + // Number of executions. + uint32 occurrences = 16; + // Average "accumulated" time in picoseconds that the operation took. + double avg_time_ps = 17; reserved 1, 3, 4, 13, 14; } diff --git a/tensorflow/core/profiler/utils/tfstreamz_utils.cc b/tensorflow/core/profiler/utils/tfstreamz_utils.cc index af957c54843ec7..0b32f5712edba5 100644 --- a/tensorflow/core/profiler/utils/tfstreamz_utils.cc +++ b/tensorflow/core/profiler/utils/tfstreamz_utils.cc @@ -112,6 +112,9 @@ Status SerializeToXPlane(const std::vector& snapshots, xevent.AddStatValue(*metadata, *xplane.GetOrCreateStatMetadata( point->string_value)); break; + case monitoring::ValueType::kDouble: + xevent.AddStatValue(*metadata, point->double_value); + break; case monitoring::ValueType::kHistogram: xevent.AddStatValue(*metadata, point->histogram_value); break; diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto index de23bad54d98de..f436e57dbf7995 100644 --- a/tensorflow/core/protobuf/config.proto +++ b/tensorflow/core/protobuf/config.proto @@ -673,6 +673,20 @@ message ConfigProto { // Whether runtime execution uses TFRT. bool use_tfrt = 18; + // If true, use Pathways with TFRT API for multi-host support. + bool enable_multi_host = 27; + + // Port for the Pathways server. Ignored if enable_multi_host=false. + int32 backend_server_port = 28; + + // If true, TFRT will use TPU-specific compiler passes and perform TPU + // specific initialization. + bool target_tpu = 29; + + // If true, TFRT will use GPU-specific compiler passes and perform GPU + // specific initialization. + bool target_gpu = 30; + // The field "coordination_service was previously specified as a string; // this has been replaced with a message below. reserved 19; @@ -711,7 +725,7 @@ message ConfigProto { reserved 25; - // Next: 27 + // Next: 31 } Experimental experimental = 16; diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index a36d3b961d93a9..63f8f6476f1696 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 1672 // Updated: 2023/11/6 +#define TF_GRAPH_DEF_VERSION 1710 // Updated: 2023/12/14 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
// diff --git a/tensorflow/core/runtime_fallback/BUILD b/tensorflow/core/runtime_fallback/BUILD index 28bd4565cebecb..da33d97329ad0b 100644 --- a/tensorflow/core/runtime_fallback/BUILD +++ b/tensorflow/core/runtime_fallback/BUILD @@ -52,10 +52,7 @@ tf_cc_binary( "//conditions:default": [ "//tensorflow/core:all_kernels", ], - }) + if_cuda([ - "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_gpu_alwayslink", - "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_gpu_alwayslink", - ]), + }) + if_cuda([]), ) cc_library( diff --git a/tensorflow/core/runtime_fallback/test/BUILD b/tensorflow/core/runtime_fallback/test/BUILD index d8fd4850f1357f..e210a8296f3b85 100644 --- a/tensorflow/core/runtime_fallback/test/BUILD +++ b/tensorflow/core/runtime_fallback/test/BUILD @@ -1,4 +1,3 @@ -load("@tf_runtime//tools:mlir_to_bef.bzl", "mlir_to_bef") load("//tensorflow:tensorflow.bzl", "tf_cc_shared_test", "tf_cc_test") # copybara:uncomment load("//third_party/tf_runtime_google/cpp_tests:gen_tests.bzl", "tfrt_cc_test_and_strict_benchmark") @@ -17,21 +16,6 @@ package_group( ], ) -mlir_to_bef( - name = "testdata/batch_function_fallback.mlir", - tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", -) - -mlir_to_bef( - name = "testdata/create_op.mlir", - tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", -) - -mlir_to_bef( - name = "testdata/custom_thread_pool.mlir", - tfrt_translate = "//tensorflow/compiler/mlir/tfrt:tfrt_fallback_translate", -) - cc_library( name = "forwarding_test_kernels", srcs = ["forwarding_test_kernels.cc"], @@ -142,43 +126,6 @@ cc_library( # ], # ) # -# # C++ benchmarks for batch function runtime fallback. -# tfrt_cc_test_and_strict_benchmark( -# name = "batch_function_fallback_benchmark", -# srcs = ["batch_function_fallback_benchmark_test.cc"], -# data = ["testdata/batch_function_fallback.mlir.bef"], -# enable_xprof = True, -# includes = ["third_party/tf_runtime/include"], -# owners = ["tf-runtime-testing"], -# tags = [ -# "need_main", -# "no_gpu", -# ], -# deps = [ -# "//base", -# "//devtools/build/runtime:get_runfiles_dir", -# "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", -# "//tensorflow/core/platform:env", -# "//tensorflow/core/platform:resource_loader", -# "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_op_handler", -# "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_tensor", -# "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", -# "//tensorflow/core/runtime_fallback/util:fallback_test_util", -# "//tensorflow/core/runtime_fallback/util:tensor_util", -# "//tensorflow/core/tfrt/utils:fallback_tensor", -# "@eigen_archive//:eigen3", -# "@tf_runtime//:bef", -# "@tf_runtime//:befexecutor", -# "@tf_runtime//:core_runtime_alwayslink", -# "@tf_runtime//:hostcontext_alwayslink", -# "@tf_runtime//:mlirtobef", -# "@tf_runtime//:support", -# "@tf_runtime//:tensor", -# "@tf_runtime//backends/cpu:core_runtime_alwayslink", -# "@tf_runtime//backends/cpu:test_ops_alwayslink", -# ], -# ) -# # # C++ tests and benchmarks for runtime fallback. 
# tfrt_cc_test_and_strict_benchmark( # name = "c_api_tfrt", @@ -214,10 +161,10 @@ cc_library( # srcs = ["runtime_fallback_kernels_test.cc"], # deps = [ # ":coreruntime_driver", -# "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", # "@com_google_googletest//:gtest", # "@com_google_googletest//:gtest_main", # "@llvm-project//llvm:Support", +# "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", # "@tf_runtime//:core_runtime", # "@tf_runtime//backends/cpu:core_runtime_alwayslink", # ] + select({ @@ -241,11 +188,11 @@ cc_library( # includes = ["third_party/tf_runtime/include"], # deps = [ # ":coreruntime_driver", +# "@com_google_googletest//:gtest", # "//tensorflow/core/platform:test_benchmark", # "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_op_handler", # "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_tensor", # "@local_tsl//tsl/platform/default/build_config:test_main", -# "@com_google_googletest//:gtest", # "@tf_runtime//:core_runtime_alwayslink", # "@tf_runtime//:hostcontext", # "@tf_runtime//:tensor", @@ -266,11 +213,11 @@ cc_library( # srcs = ["kernel_fallback_compat_request_state_test.cc"], # includes = ["third_party/tf_runtime/include"], # deps = [ +# "@com_google_googletest//:gtest", # "//tensorflow/core/framework:tensor_testutil", # "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_compat_request_state", # "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_op_handler", # "@local_tsl//tsl/platform/default/build_config:test_main", -# "@com_google_googletest//:gtest", # "@tf_runtime//:core_runtime_alwayslink", # ], # ) @@ -323,32 +270,3 @@ cc_library( }), alwayslink = 1, ) - -tf_cc_shared_test( - name = "kernel_fallback_compat_test", - srcs = ["kernel_fallback_compat_test.cc"], - data = [ - "testdata/create_op.mlir.bef", - "testdata/custom_thread_pool.mlir.bef", - ], - tags = ["no_oss"], - deps = [ - "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", - "//tensorflow/core:all_kernels", - "//tensorflow/core:lib", - "//tensorflow/core/platform:resource_loader", - "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_compat_request_state", - "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", - "//tensorflow/core/runtime_fallback/util:fallback_test_util", - "//tensorflow/core/tfrt/fallback:op_kernel_runner", - "//tensorflow/core/tfrt/runtime", - "//tensorflow/core/tfrt/utils:thread_pool", - "@com_google_googletest//:gtest_main", - "@tf_runtime//:bef", - "@tf_runtime//:befexecutor", - "@tf_runtime//:core_runtime", - "@tf_runtime//:hostcontext", - "@tf_runtime//:init_tfrt_dialects", - "@tf_runtime//:tracing", - ], -) diff --git a/tensorflow/core/runtime_fallback/util/BUILD b/tensorflow/core/runtime_fallback/util/BUILD index 60a8b009c2d9a2..92db3499ec3977 100644 --- a/tensorflow/core/runtime_fallback/util/BUILD +++ b/tensorflow/core/runtime_fallback/util/BUILD @@ -10,6 +10,7 @@ package_group( name = "internal", packages = [ "//learning/brain/experimental/tfrt/native_lowering/kernels/...", + "//tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/...", "//tensorflow/core/runtime_fallback/...", "//tensorflow/core/tfrt/utils/...", ], diff --git a/tensorflow/core/tfrt/BUILD b/tensorflow/core/tfrt/BUILD index ecf03260bba471..790641ff63b747 100644 --- a/tensorflow/core/tfrt/BUILD +++ b/tensorflow/core/tfrt/BUILD @@ -9,3 +9,13 @@ cc_library( ], alwayslink = 1, ) + +cc_library( + name = "ifrt_program_ops_op_lib", + visibility = ["//visibility:public"], + deps = [ + 
"//tensorflow/core/tfrt/kernels:ifrt_program_ops", + "//tensorflow/core/tfrt/ops:ifrt_program_ops_op_lib", + ], + alwayslink = 1, +) diff --git a/tensorflow/core/tfrt/common/BUILD b/tensorflow/core/tfrt/common/BUILD index f6928dc560b5f6..469bbff25d6f04 100644 --- a/tensorflow/core/tfrt/common/BUILD +++ b/tensorflow/core/tfrt/common/BUILD @@ -20,6 +20,7 @@ package_group( # copybara:uncomment "//learning/brain/google/xla/...", # copybara:uncomment "//learning/brain/tfrc/...", # copybara:uncomment "//learning/brain/tfrt/...", + # copybara:uncomment "//learning/serving/model_servers/...", # copybara:uncomment "//platforms/xla/megascale/tensorflow/...", "//tensorflow/c/...", "//tensorflow/compiler/jit/...", diff --git a/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc b/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc index 5369961b4a87da..7e383c57b6311b 100644 --- a/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc +++ b/tensorflow/core/tfrt/common/pjrt_gpu_client_registration.cc @@ -27,12 +27,13 @@ namespace xla { StatusOr> GetGpuClient( const PjrtClientFactoryOptions& option) { + xla::GpuClientOptions gpu_client_options; + gpu_client_options.node_id = option.gpu_options.node_id; + gpu_client_options.num_nodes = 1; + gpu_client_options.allowed_devices = option.gpu_options.allowed_devices; + gpu_client_options.platform_name = option.gpu_options.platform_name; TF_ASSIGN_OR_RETURN(std::unique_ptr client, - xla::GetStreamExecutorGpuClient( - option.gpu_options.asynchronous, - /*allocator_config=*/{}, option.gpu_options.node_id, - /*num_nodes=*/1, option.gpu_options.allowed_devices, - option.gpu_options.platform_name)); + xla::GetStreamExecutorGpuClient(gpu_client_options)); return std::move(client); } diff --git a/tensorflow/core/tfrt/graph_executor/BUILD b/tensorflow/core/tfrt/graph_executor/BUILD index 38653fe09f1c0a..d99f3519d05206 100644 --- a/tensorflow/core/tfrt/graph_executor/BUILD +++ b/tensorflow/core/tfrt/graph_executor/BUILD @@ -205,6 +205,10 @@ tf_proto_library( name = "test_config_proto", testonly = True, srcs = ["test_config.proto"], + visibility = if_google( + [":friends"], + ["//visibility:public"], + ), ) tf_cc_test( diff --git a/tensorflow/core/tfrt/ifrt/BUILD b/tensorflow/core/tfrt/ifrt/BUILD index 092532a9a9df56..a48a74128e6212 100644 --- a/tensorflow/core/tfrt/ifrt/BUILD +++ b/tensorflow/core/tfrt/ifrt/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ @@ -17,16 +19,19 @@ cc_library( srcs = ["ifrt_serving_executable.cc"], hdrs = ["ifrt_serving_executable.h"], deps = [ - "//tensorflow/compiler/mlir/tfrt:tf2hlo", + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:tf2hlo", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:span", "@llvm-project//mlir:IR", "@local_tsl//tsl/concurrency:ref_count", @@ -66,3 +71,94 @@ cc_library( "@local_xla//xla/python/ifrt", ], ) + +cc_library( + name = "sharding_utils", + srcs = [ + "sharding_utils.cc", + ], + hdrs = [ + "sharding_utils.h", + ], + 
deps = [ + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:statusor", + "//tensorflow/core/tpu/kernels:sharding_utils", + "@com_google_absl//absl/container:btree", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/concurrency:ref_count", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla:executable_run_options", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/python/pjrt_ifrt", + ], +) + +tf_cc_test( + name = "sharding_utils_test", + srcs = ["sharding_utils_test.cc"], + tags = ["no_oss"], + deps = [ + ":sharding_utils", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core/framework:tensor_matcher", + "//tensorflow/core/framework:tensor_testutil", + "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest_main", + "@eigen_archive//:eigen3", + "@llvm-project//llvm:Support", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla/hlo/ir:hlo", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/python/ifrt:test_util", + "@local_xla//xla/python/ifrt/ir", + "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", + "@local_xla//xla/python/pjrt_ifrt:xla_ifrt", + ], +) + +tf_cc_test( + name = "ifrt_serving_executable_test", + srcs = [ + "ifrt_serving_executable_test.cc", + ], + data = [ + "//tensorflow/core/tfrt/ifrt/testdata", + ], + tags = ["no_oss"], + deps = [ + ":ifrt_serving_executable", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core/framework:tensor", + "//tensorflow/core/framework:types_proto_cc", + "//tensorflow/core/platform:resource_loader", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@local_tsl//tsl/platform:statusor", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/python/ifrt:test_util", + "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", + ], +) diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc index a03958bcbd644d..649e813a511ddd 100644 --- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc +++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc @@ -1,4 +1,3 @@ - /* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +24,9 @@ limitations under the License. #include "absl/log/log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/synchronization/mutex.h" #include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -33,6 +34,7 @@ limitations under the License. 
#include "xla/python/ifrt/array.h" #include "xla/python/ifrt/client.h" #include "xla/python/ifrt/dtype.h" +#include "xla/python/ifrt/executable.h" #include "xla/python/ifrt/future.h" #include "xla/python/ifrt/memory.h" #include "xla/python/ifrt/shape.h" @@ -84,24 +86,70 @@ IfrtServingExecutable::ConvertTensorToArray(const tensorflow::Tensor& tensor) { return single_array; } -absl::StatusOr> IfrtServingExecutable::Execute( +xla::ifrt::Future>> +IfrtServingExecutable::LookUpOrCreateExecutable( absl::Span inputs) { - // TODO(b/304839793): Build cache based on tensorshape etc - if (!ifrt_executable_) { - LOG(INFO) << "Cache missed. Building executable"; - - TF_ASSIGN_OR_RETURN(auto mlir_hlo_module, - CompileTfToHlo(*module_, inputs, signature_name(), - ifrt_client_->GetDefaultCompiler(), - shape_representation_fn_)); - - TF_ASSIGN_OR_RETURN( - ifrt_executable_, - ifrt_client_->GetDefaultCompiler()->Compile( - std::make_unique(mlir_hlo_module.get()), - std::make_unique())); + std::vector input_shapes; + for (const auto& tensor : inputs) { + input_shapes.push_back(tensor.shape()); + } + Key key(input_shapes); + + xla::ifrt::Promise< + absl::StatusOr>> + promise; + xla::ifrt::Future< + absl::StatusOr>> + future; + + { + absl::MutexLock lock(&mutex_); + + const auto it = ifrt_executables_.find(key); + if (it != ifrt_executables_.end()) { + return it->second; + } + + // Only create promise and future when cache missed. + promise = xla::ifrt::Future>>::CreatePromise(); + future = xla::ifrt::Future< + absl::StatusOr>>(promise); + + ifrt_executables_.emplace(key, future); + } + + LOG(INFO) << "Cache missed. Building executable"; + + absl::StatusOr> mlir_hlo_module = + CompileTfToHlo(*module_, inputs, signature_name(), + ifrt_client_->GetDefaultCompiler(), + shape_representation_fn_); + if (!mlir_hlo_module.ok()) { + promise.Set(mlir_hlo_module.status()); + return future; + } + + absl::StatusOr> ifrt_executable = + ifrt_client_->GetDefaultCompiler()->Compile( + std::make_unique(mlir_hlo_module->get()), + std::make_unique()); + if (!ifrt_executable.ok()) { + promise.Set(ifrt_executable.status()); + return future; } + promise.Set(std::shared_ptr( + std::move(*ifrt_executable))); + return future; +} + +absl::StatusOr> IfrtServingExecutable::Execute( + absl::Span inputs) { + TF_ASSIGN_OR_RETURN( + std::shared_ptr ifrt_executable, + LookUpOrCreateExecutable(inputs).Await()); + std::vector> args; args.reserve(inputs.size()); for (auto& tensor : inputs) { @@ -110,7 +158,7 @@ absl::StatusOr> IfrtServingExecutable::Execute( } TF_ASSIGN_OR_RETURN(auto execution_result, - ifrt_executable_->Execute( + ifrt_executable->Execute( absl::MakeSpan(args), /*options=*/{.untuple_result = true}, std::nullopt)); diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h index 64ce6580f8cfab..9b1d86cbcbbfd7 100644 --- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h +++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h @@ -21,9 +21,12 @@ limitations under the License. #include #include +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" #include "absl/log/log.h" #include "absl/status/statusor.h" #include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" #include "absl/types/span.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -32,6 +35,7 @@ limitations under the License. 
#include "xla/python/ifrt/array.h" #include "xla/python/ifrt/client.h" #include "xla/python/ifrt/executable.h" +#include "xla/python/ifrt/future.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tsl/concurrency/ref_count.h" @@ -65,7 +69,30 @@ class IfrtServingExecutable { absl::StatusOr> Execute( absl::Span inputs); + int num_executables() const { + absl::MutexLock lock(&mutex_); + return ifrt_executables_.size(); + } + private: + // In memory cache key. + struct Key { + std::vector input_shapes; + template + friend H AbslHashValue(H h, const Key& key) { + for (const auto& shape : key.input_shapes) { + for (auto size : shape.dim_sizes()) { + h = H::combine(std::move(h), size); + } + } + return h; + } + + friend bool operator==(const Key& x, const Key& y) { + return x.input_shapes == y.input_shapes; + } + }; + std::string model_name_; std::string signature_name_; @@ -76,10 +103,17 @@ class IfrtServingExecutable { tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn_; - std::unique_ptr ifrt_executable_; + mutable absl::Mutex mutex_; + absl::flat_hash_map>>> + ifrt_executables_ ABSL_GUARDED_BY(mutex_); absl::StatusOr> ConvertTensorToArray( const tensorflow::Tensor& tensor); + + xla::ifrt::Future< + absl::StatusOr>> + LookUpOrCreateExecutable(absl::Span inputs); }; } // namespace ifrt_serving diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc new file mode 100644 index 00000000000000..a2de6e9a68e16e --- /dev/null +++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc @@ -0,0 +1,162 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h" + +#include +#include +#include +#include +#include + +#include +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/test_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/resource_loader.h" +#include "tensorflow/core/platform/test.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace ifrt_serving { +namespace { + +TEST(IfrtServingExecutableTest, Basic) { + // Create test input module + constexpr absl::string_view kDataDirectory = + "tensorflow/core/tfrt/ifrt/testdata"; + std::string mlir_module_path = tensorflow::GetDataDependencyFilepath( + absl::StrCat(kDataDirectory, "/executable.mlir")); + + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + + mlir::MLIRContext context(registry); + + mlir::OwningOpRef mlir_module = + mlir::parseSourceFile(mlir_module_path, &context); + + ASSERT_TRUE(mlir_module); + + // Create contexts required for the compiler execution. + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + + IfrtServingExecutable executable("test", "main", std::move(mlir_module), + client, + tensorflow::IdentityShapeRepresentationFn()); + + tensorflow::Tensor x(tensorflow::DT_INT32, tensorflow::TensorShape({1, 3})); + tensorflow::Tensor y(tensorflow::DT_INT32, tensorflow::TensorShape({3, 1})); + for (int i = 0; i < 3; ++i) { + x.flat()(i) = i + 1; + y.flat()(i) = i + 1; + } + + std::vector inputs{x, y}; + TF_ASSERT_OK_AND_ASSIGN(auto result, + executable.Execute(absl::MakeSpan(inputs))); + + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result[0].dtype(), tensorflow::DT_INT32); + ASSERT_EQ(result[0].shape(), tensorflow::TensorShape({1, 1})); + ASSERT_EQ(result[0].flat()(0), 14); +} + +TEST(IfrtServingExecutableTest, MultipleShapes) { + // Create test input module + constexpr absl::string_view kDataDirectory = + "tensorflow/core/tfrt/ifrt/testdata"; + std::string mlir_module_path = tensorflow::GetDataDependencyFilepath( + absl::StrCat(kDataDirectory, "/executable.mlir")); + + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + + mlir::MLIRContext context(registry); + + mlir::OwningOpRef mlir_module = + mlir::parseSourceFile(mlir_module_path, &context); + + ASSERT_TRUE(mlir_module); + + // Create contexts required for the compiler execution. 
+ TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + + IfrtServingExecutable executable("test", "main", std::move(mlir_module), + client, + tensorflow::IdentityShapeRepresentationFn()); + + constexpr int kDim1 = 3; + tensorflow::Tensor x1(tensorflow::DT_INT32, + tensorflow::TensorShape({1, kDim1})); + tensorflow::Tensor y1(tensorflow::DT_INT32, + tensorflow::TensorShape({kDim1, 1})); + for (int i = 0; i < kDim1; ++i) { + x1.flat()(i) = i + 1; + y1.flat()(i) = i + 1; + } + std::vector inputs1{x1, y1}; + + constexpr int kDim2 = 4; + tensorflow::Tensor x2(tensorflow::DT_INT32, + tensorflow::TensorShape({1, kDim2})); + tensorflow::Tensor y2(tensorflow::DT_INT32, + tensorflow::TensorShape({kDim2, 1})); + for (int i = 0; i < kDim2; ++i) { + x2.flat()(i) = i + 1; + y2.flat()(i) = i + 1; + } + std::vector inputs2{x2, y2}; + + std::vector outputs1, outputs2; + for (int i = 0; i < 3; i++) { + TF_ASSERT_OK_AND_ASSIGN(outputs1, + executable.Execute(absl::MakeSpan(inputs1))); + TF_ASSERT_OK_AND_ASSIGN(outputs2, + executable.Execute(absl::MakeSpan(inputs2))); + } + ASSERT_EQ(outputs1.size(), 1); + ASSERT_EQ(outputs1[0].dtype(), tensorflow::DT_INT32); + ASSERT_EQ(outputs1[0].shape(), tensorflow::TensorShape({1, 1})); + ASSERT_EQ(outputs1[0].flat()(0), 14); + + ASSERT_EQ(outputs2.size(), 1); + ASSERT_EQ(outputs2[0].dtype(), tensorflow::DT_INT32); + ASSERT_EQ(outputs2[0].shape(), tensorflow::TensorShape({1, 1})); + ASSERT_EQ(outputs2[0].flat()(0), 30); + + ASSERT_EQ(executable.num_executables(), 2); +} + +} // namespace +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/core/tfrt/ifrt/sharding_utils.cc b/tensorflow/core/tfrt/ifrt/sharding_utils.cc new file mode 100644 index 00000000000000..c61d528897bb1b --- /dev/null +++ b/tensorflow/core/tfrt/ifrt/sharding_utils.cc @@ -0,0 +1,393 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/tfrt/ifrt/sharding_utils.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/btree_map.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/device.h" +#include "xla/python/ifrt/dtype.h" +#include "xla/python/ifrt/index_domain.h" +#include "xla/python/ifrt/memory.h" +#include "xla/python/ifrt/shape.h" +#include "xla/python/ifrt/sharding.h" +#include "xla/python/pjrt_ifrt/pjrt_array.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tpu/kernels/sharding_utils.h" +#include "tsl/concurrency/ref_count.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace ifrt_serving { +namespace { +absl::StatusOr ToIfrtDType( + tensorflow::DataType tensor_dtype) { + xla::PrimitiveType primitive_type; + TF_RETURN_IF_ERROR( + tensorflow::DataTypeToPrimitiveType(tensor_dtype, &primitive_type)); + return xla::ifrt::ToDType(primitive_type); +} + +// Shard the given `input_tensor` into slices of equal shape. +// +// `num_partitions_per_axis` specifies the number of partitions along +// each axis (dimension). +// +// `num_replicas` specifies the number of replications for each partitioned +// sliced buffer. +// +// `devices` contains a list of devices flattened into the following +// order: [slice0][replicate0], [slice0][replicate1], ..., [slice1][replicate0], +// [slice1][replicate1], ... +absl::StatusOr>> +SplitAndCreateArraysFromHostBuffer( + xla::ifrt::Client& ifrt_client, const tensorflow::Tensor& input_tensor, + const std::vector& num_partitions_per_axis, int num_replicas, + const std::vector& devices, + const Eigen::ThreadPoolDevice& thread_pool_device) { + int64_t num_slices = 1; + for (auto k : num_partitions_per_axis) { + num_slices *= k; + } + + tensorflow::DataType tensor_data_type = input_tensor.dtype(); + std::vector paddings(num_partitions_per_axis.size(), 0); + std::vector split_tensors; + split_tensors.resize(num_slices); + + auto allocate_output_fn = + [&](int i, const tensorflow::TensorShape& output_slice_shape, + tensorflow::Tensor** tensor) { + if (i < 0 || i >= split_tensors.size()) { + return absl::InvalidArgumentError(absl::StrCat( + "Index ", i, " out of range [0, ", split_tensors.size(), "]")); + } + split_tensors[i] = + tensorflow::Tensor(tensor_data_type, output_slice_shape); + *tensor = &split_tensors[i]; + return absl::OkStatus(); + }; + + // Fast path for output in the simple no-split case. + auto assign_or_copy_value_fn = + [&](const tensorflow::Tensor& input) -> Status { + split_tensors[0] = input; + return absl::OkStatus(); + }; + + // XlaNDSplitter only supports rank (0, 8] as there is no concept of split for + // rank 0 tensor.
+ if (input_tensor.shape().dims() == 0) { + if (split_tensors.size() != 1) { + return absl::InvalidArgumentError(absl::StrCat( + "Rank 0 tensor only expects 1 slice but got ", split_tensors.size())); + } + split_tensors[0] = input_tensor; + } else { + switch (input_tensor.dtype()) { +#define CASE(type) \ + case tensorflow::DataTypeToEnum::value: { \ + TF_ASSIGN_OR_RETURN(auto splitter, \ + (XlaNDSplitter::Create( \ + num_partitions_per_axis, num_slices, paddings, \ + /*has_paddings=*/false))); \ + TF_RETURN_IF_ERROR( \ + splitter.Split(&input_tensor, "input tensor", assign_or_copy_value_fn, \ + allocate_output_fn, thread_pool_device)); \ + } break; + TF_CALL_ALL_TYPES(CASE); + TF_CALL_quint8(CASE); +#undef CASE + default: + return absl::InvalidArgumentError("Unsupported data type"); + } + } + + if (split_tensors.size() * num_replicas != devices.size()) { + return absl::InvalidArgumentError( + absl::StrCat("Expect ", devices.size(), " but got ", + split_tensors.size(), " x ", num_replicas)); + } + + std::vector> arrays; + arrays.reserve(devices.size()); + TF_ASSIGN_OR_RETURN(xla::ifrt::DType dtype, ToIfrtDType(tensor_data_type)); + auto device_iter = devices.begin(); + for (int slice_idx = 0; slice_idx < split_tensors.size(); ++slice_idx) { + auto& tensor = split_tensors[slice_idx]; + + for (int i = 0; i < num_replicas; ++i) { + VLOG(2) << "Make array for buffer slice " << slice_idx << " at " + << tensor.data(); + if (device_iter == devices.end()) { + return absl::InternalError( + absl::StrCat("Missing Device ", i, " for slice ", slice_idx)); + } + auto single_device_sharding = xla::ifrt::SingleDeviceSharding::Create( + *device_iter, xla::ifrt::MemoryKind()); + + TF_ASSIGN_OR_RETURN( + auto array, + ifrt_client.MakeArrayFromHostBuffer( + tensor.data(), dtype, + xla::ifrt::Shape(tensor.shape().dim_sizes()), + /*byte_strides=*/{}, std::move(single_device_sharding), + xla::ifrt::Client::HostBufferSemantics:: + kImmutableUntilTransferCompletes, + [tensor, slice_idx]() { + // Keep tensor alive + LOG(INFO) << "Done with host buffer for slice " << slice_idx + << " at " << tensor.data(); + })); + arrays.push_back(std::move(array)); + device_iter++; + } + } + return arrays; +} + +absl::StatusOr VerifyIndexDomainsAndGetReplicas( + absl::Span index_domains, + const tensorflow::TensorShape& tensor_shape) { + if (index_domains.size() <= 1) { + return absl::InvalidArgumentError(absl::StrCat( + "Expect multiple index domains but got ", index_domains.size())); + } + + for (auto index_domain = index_domains.begin(); + index_domain < index_domains.end(); ++index_domain) { + if (index_domain->shape().dims().size() != tensor_shape.dims()) { + return absl::InvalidArgumentError( + absl::StrCat("Expect equal rank of ", tensor_shape.dims(), + " but got ", index_domain->shape().dims().size())); + } + } + + // Only support equal shape for all index domains + auto first_index_domain = index_domains.begin(); + for (auto index_domain = index_domains.begin() + 1; + index_domain < index_domains.end(); ++index_domain) { + if (first_index_domain->shape() != index_domain->shape()) { + return absl::UnimplementedError(absl::StrCat( + "Expect equal shape of ", first_index_domain->shape().DebugString(), + " but got ", index_domain->shape().DebugString())); + } + } + + // Verify that each `IndexDomain` appear the same `num_replica` times. Since + // shapes are the same for all `IndexDomain`, this also implies each `origin` + // appear `num_replica` times. 
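// Editorial illustration (not part of the original patch): for a [2, 2] tensor
// sharded across 4 devices with 2 partitions along dim 1 and 2-way
// replication, IndexDomains() yields
//   {origin = [0, 0], shape = [2, 1]} twice and
//   {origin = [0, 1], shape = [2, 1]} twice,
// so each distinct IndexDomain is counted twice and num_replicas resolves to 2.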
+ auto index_domain_lexicographical_comparator = + [](const xla::ifrt::IndexDomain& a, const xla::ifrt::IndexDomain& b) { + return std::lexicographical_compare( + a.origin().elements().begin(), a.origin().elements().end(), + b.origin().elements().begin(), b.origin().elements().end()); + }; + absl::btree_map + index_domain_counts; + for (const auto& index_domain : index_domains) { + index_domain_counts[index_domain]++; + } + + std::vector unique_index_domains; + unique_index_domains.reserve(index_domain_counts.size()); + int num_replicas = index_domain_counts.begin()->second; + for (const auto& [index_domain, count] : index_domain_counts) { + if (count != num_replicas) { + return absl::FailedPreconditionError(absl::StrCat( + "Expected ", num_replicas, " replicas for ", + index_domain.DebugString(), " but got ", count, " replicas")); + } + unique_index_domains.push_back(index_domain); + } + + // Verify that distances of between origins of neighbouring `IndexDomain` + // bounded by shape. Note that unique_indexx_domains are already in sorted + // order. + auto prev_iter = unique_index_domains.begin(); + auto next_iter = unique_index_domains.begin() + 1; + const auto& bounded_box = first_index_domain->shape(); + while (prev_iter != unique_index_domains.end() && + next_iter != unique_index_domains.end()) { + xla::ifrt::Index offset = next_iter->origin() - prev_iter->origin(); + for (int dim = 0; dim < bounded_box.dims().size(); ++dim) { + if (std::abs(offset.elements()[dim]) != bounded_box.dims()[dim] && + offset.elements()[dim] != 0) { + return absl::FailedPreconditionError(absl::StrCat( + "IndexDomains should not have gap or overlap, but got ", + prev_iter->DebugString(), " and ", next_iter->DebugString(), + " that have offset of ", offset.DebugString())); + } + } + prev_iter = next_iter; + next_iter++; + } + + // Verify the last `IndexDomain`'s upper end of the bound matches with the + // tensor shape. Together with the above check, this provides an approximation + // to the following two assumptions: + // 1. the union of all IndexDomain covers the entire global shape array with + // no gaps. + // 2. no two index_domain have any overlap. 
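// Editorial illustration (not part of the original patch): if a [4, 4] tensor
// is split into [2, 2] slices, the lexicographically last IndexDomain has
// origin [2, 2] and shape [2, 2], so the bound computed below is
// [2 + 2, 2 + 2] = [4, 4], which must match the global tensor shape exactly.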
+ std::vector bounded_shape; + const auto& last_index_domain = unique_index_domains.back(); + bounded_shape.reserve(last_index_domain.shape().dims().size()); + for (int d = 0; d < last_index_domain.shape().dims().size(); ++d) { + bounded_shape.push_back(last_index_domain.origin().elements()[d] + + last_index_domain.shape().dims()[d]); + } + + if (xla::ifrt::Shape(bounded_shape) != + xla::ifrt::Shape(tensor_shape.dim_sizes())) { + return absl::FailedPreconditionError(absl::StrCat( + "IndexDomain ", last_index_domain.DebugString(), + " does not overlap with tensor shape ", tensor_shape.DebugString())); + } + + return num_replicas; +} + +} // namespace + +StatusOr> MakeAssembledArrayFromHostBuffer( + xla::ifrt::Client& ifrt_client, const tensorflow::Tensor& input_tensor, + std::shared_ptr sharding, + const Eigen::ThreadPoolDevice& thread_pool_device) { + VLOG(2) << "Assembling arrays by sharding " << sharding->DebugString(); + + TF_ASSIGN_OR_RETURN(auto index_domains, + sharding->IndexDomains( + xla::ifrt::Shape(input_tensor.shape().dim_sizes()))); + + TF_ASSIGN_OR_RETURN(int index_domain_replicas, + VerifyIndexDomainsAndGetReplicas( + absl::MakeSpan(index_domains), input_tensor.shape())); + + const auto& first_index_domain = index_domains.begin(); + std::vector num_partitions_per_axis; + int total_num_partitions = 1; + num_partitions_per_axis.reserve(input_tensor.shape().dims()); + for (int dim = 0; dim < input_tensor.shape().dims(); ++dim) { + int target_size = first_index_domain->shape().dims()[dim]; + if (input_tensor.shape().dim_size(dim) % target_size != 0) { + return absl::FailedPreconditionError(absl::StrCat( + "Only support even sharding, but input tensor shape ", + input_tensor.shape().DebugString(), " not even splittable to ", + first_index_domain->shape().DebugString())); + } + int num_partitions = input_tensor.shape().dim_size(dim) / target_size; + total_num_partitions *= num_partitions; + num_partitions_per_axis.push_back(num_partitions); + } + + if (total_num_partitions > sharding->devices().size() || + sharding->devices().size() % total_num_partitions != 0) { + return absl::UnimplementedError(absl::StrCat( + "Number of devices ", sharding->devices().size(), + " not a multiple of number of partitions", total_num_partitions)); + } + + // Assume index domains are non-overlapping and each index domain appears + // exactly num_replicates times. This allows us to rely on + // lexicographical sorting to replicate slices in the correct order. + int num_replicas = sharding->devices().size() / total_num_partitions; + if (index_domain_replicas != num_replicas) { + return absl::FailedPreconditionError( + absl::StrCat("IndexDomain indicates ", index_domain_replicas, + " replicas, but got ", num_replicas, " replicas")); + } + + // Sorted the IndexDomain and devices from major to minor dimenson. For + // example, a two dimension IndexDomain will be ordered by [0, 0], [0, 1], [1, + // 0], [1, 1]. + // This is O(n*log(n)) vs looking for devices individually which is O(n^2). + struct IndexDomainDevice { + xla::ifrt::IndexDomain index_domain; + xla::ifrt::Device* device; + // The index of this `device`/`index_domain` in the + // sharding.devices/index_domains. 
+ int original_shard_index; + }; + std::vector index_domain_devices; + index_domain_devices.reserve(index_domains.size()); + for (int i = 0; i < index_domains.size(); ++i) { + index_domain_devices.push_back( + {index_domains[i], sharding->devices()[i], i}); + } + std::sort(index_domain_devices.begin(), index_domain_devices.end(), + [](const IndexDomainDevice& a, const IndexDomainDevice& b) { + return std::lexicographical_compare( + a.index_domain.origin().elements().begin(), + a.index_domain.origin().elements().end(), + b.index_domain.origin().elements().begin(), + b.index_domain.origin().elements().end()); + }); + // Now the devices is in order. + std::vector devices; + devices.reserve(index_domain_devices.size()); + std::vector original_device_indices; + original_device_indices.reserve(index_domain_devices.size()); + for (auto& [index_domain, device, original_device_index] : + index_domain_devices) { + devices.push_back(device); + original_device_indices.push_back(original_device_index); + VLOG(3) << "Device " << device->ToString(); + } + + TF_ASSIGN_OR_RETURN(auto arrays, + SplitAndCreateArraysFromHostBuffer( + ifrt_client, input_tensor, num_partitions_per_axis, + num_replicas, devices, thread_pool_device)); + + // Re-arranged arrays back to original device order + std::vector> rearranged_arrays; + rearranged_arrays.resize(arrays.size()); + for (int i = 0; i < arrays.size(); ++i) { + rearranged_arrays[original_device_indices[i]] = std::move(arrays[i]); + } + + return ifrt_client.AssembleArrayFromSingleDeviceArrays( + xla::ifrt::Shape(input_tensor.shape().dim_sizes()), sharding, + absl::MakeSpan(rearranged_arrays), + xla::ifrt::ArrayCopySemantics::kDonateInput); +} + +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/core/tfrt/ifrt/sharding_utils.h b/tensorflow/core/tfrt/ifrt/sharding_utils.h new file mode 100644 index 00000000000000..5e19d46443582c --- /dev/null +++ b/tensorflow/core/tfrt/ifrt/sharding_utils.h @@ -0,0 +1,42 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_IFRT_SHARDING_UTILS_H_ +#define TENSORFLOW_CORE_TFRT_IFRT_SHARDING_UTILS_H_ + +#include + +#include "xla/executable_run_options.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/device.h" +#include "xla/python/ifrt/sharding.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/statusor.h" +#include "tsl/concurrency/ref_count.h" + +namespace tensorflow { +namespace ifrt_serving { + +// Sharded the given `data` by the `sharding` specification. +// It currently supports even sharding, replication and partial replication. 
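// Example usage (editorial sketch, not part of the original patch; assumes an
// `ifrt_client`, a populated `sharding`, and an Eigen `thread_pool_device`
// created elsewhere, e.g. from a tsl::thread::ThreadPool):
//
//   tensorflow::Tensor input(tensorflow::DT_INT32,
//                            tensorflow::TensorShape({4, 4}));
//   // ... fill `input` with host data ...
//   TF_ASSIGN_OR_RETURN(
//       tsl::RCReference<xla::ifrt::Array> sharded_array,
//       MakeAssembledArrayFromHostBuffer(ifrt_client, input, sharding,
//                                        thread_pool_device));
//   // `sharded_array` is one assembled IFRT array whose per-device shards
//   // follow `sharding`.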
+StatusOr> MakeAssembledArrayFromHostBuffer( + xla::ifrt::Client& ifrt_client, const tensorflow::Tensor& input_tensor, + std::shared_ptr sharding, + const Eigen::ThreadPoolDevice& thread_pool_device); +} // namespace ifrt_serving +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TFRT_IFRT_SHARDING_UTILS_H_ diff --git a/tensorflow/core/tfrt/ifrt/sharding_utils_test.cc b/tensorflow/core/tfrt/ifrt/sharding_utils_test.cc new file mode 100644 index 00000000000000..57fd9ed7003bb7 --- /dev/null +++ b/tensorflow/core/tfrt/ifrt/sharding_utils_test.cc @@ -0,0 +1,516 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/tfrt/ifrt/sharding_utils.h" + +#include +#include +#include +#include +#include + +#include +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "llvm/ADT/SmallVector.h" +#include "xla/hlo/ir/hlo_sharding.h" +#include "xla/python/ifrt/array.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/device.h" +#include "xla/python/ifrt/ir/sharding_param.h" +#include "xla/python/ifrt/memory.h" +#include "xla/python/ifrt/shape.h" +#include "xla/python/ifrt/sharding.h" +#include "xla/python/ifrt/test_util.h" +#include "xla/python/pjrt_ifrt/xla_sharding.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_matcher.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/env.h" +#include "tsl/platform/status_matchers.h" +#include "tsl/platform/statusor.h" +#include "tsl/platform/test.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace ifrt_serving { +namespace { + +using tsl::testing::StatusIs; + +struct HloShardingTestParam { + tensorflow::Tensor in_tensor; + std::vector expected_out_tensors; + std::vector device_indices; + xla::HloSharding sharding; +}; + +struct ShardingParamTestParam { + tensorflow::Tensor in_tensor; + std::vector expected_out_tensors; + std::vector device_indices; + + // Parameter to form ShardingParam + std::vector dim_shards; + llvm::SmallVector permutation; + llvm::SmallVector axis_sizes; +}; + +using ShardingParamTest = ::testing::TestWithParam; +using HloShardingTest = ::testing::TestWithParam; + +// Wrapper functions for building sharding specs for a given shape with a +// natural device order. 
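// Editorial note (not part of the original patch): with four devices in their
// natural order {0, 1, 2, 3},
//   Tile({2, 2})           assigns devices 0 and 1 to the first row of tiles
//                          and devices 2 and 3 to the second row,
//   PartialTile({2, 1, 2}) splits dim 0 in two and replicates each half on a
//                          pair of devices, and
//   Replicate()            gives every device a full copy of the tensor.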
+xla::HloSharding Tile(absl::Span dims) { + return xla::HloSharding::IotaTile(dims); +} +xla::HloSharding PartialTile(absl::Span dims) { + return xla::HloSharding::PartialTile(xla::TileAssignment(dims)); +} +xla::HloSharding Replicate() { return xla::HloSharding::Replicate(); } + +TEST_P(HloShardingTest, MakeAssembledArrayFromHostBuffer) { + constexpr int kMaxParallelism = 16; + auto thread_pool = std::make_unique( + tsl::Env::Default(), tsl::ThreadOptions(), "Resharding", kMaxParallelism); + + Eigen::ThreadPoolDevice device(thread_pool->AsEigenThreadPool(), + kMaxParallelism); + + auto input_tensor = GetParam().in_tensor; + + // Create contexts required for the compiler execution. + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + TF_ASSERT_OK_AND_ASSIGN(auto device_list, + xla::ifrt::test_util::GetDevices( + client.get(), GetParam().device_indices)); + + auto sharding = xla::ifrt::HloSharding::Create( + device_list, xla::ifrt::MemoryKind(), GetParam().sharding); + + TF_ASSERT_OK_AND_ASSIGN( + auto assembled_array, + MakeAssembledArrayFromHostBuffer(*client, input_tensor, + std::move(sharding), device)); + + TF_ASSERT_OK_AND_ASSIGN(auto disassembled_arrays, + assembled_array->DisassembleIntoSingleDeviceArrays( + xla::ifrt::ArrayCopySemantics::kAlwaysCopy)); + + ASSERT_EQ(disassembled_arrays.size(), GetParam().expected_out_tensors.size()); + + tensorflow::Tensor host_tensor(tensorflow::DT_INT32, + tensorflow::TensorShape({1, 2})); + + for (int i = 0; i < disassembled_arrays.size(); ++i) { + SCOPED_TRACE(absl::StrCat("Array ", i, " of ", disassembled_arrays.size())); + auto disassembled_array = disassembled_arrays[i]; + auto expected_out_tensor = GetParam().expected_out_tensors[i]; + ASSERT_EQ(disassembled_array->shape(), + xla::ifrt::Shape(expected_out_tensor.shape().dim_sizes())); + tensorflow::Tensor host_tensor(expected_out_tensor.dtype(), + expected_out_tensor.shape()); + TF_ASSERT_OK( + disassembled_array + ->CopyToHostBuffer(host_tensor.data(), /*byte_strides=*/{}, + xla::ifrt::ArrayCopySemantics::kAlwaysCopy) + .Await()); + EXPECT_THAT(expected_out_tensor, tensorflow::test::TensorEq(host_tensor)); + } +} + +INSTANTIATE_TEST_SUITE_P( + HloShardingTests, HloShardingTest, + ::testing::ValuesIn( + { + // Full replication. 
+ { + .in_tensor = test::AsTensor({1}, TensorShape({})), + .expected_out_tensors = + { + test::AsTensor({1}, TensorShape({})), + test::AsTensor({1}, TensorShape({})), + }, + .device_indices = {0, 1}, + .sharding = Replicate(), + }, + { + .in_tensor = test::AsTensor({1, 2, 3}, + TensorShape({3, 1})), + .expected_out_tensors = + { + test::AsTensor({1, 2, 3}, TensorShape({3, 1})), + test::AsTensor({1, 2, 3}, TensorShape({3, 1})), + }, + .device_indices = {0, 1}, + .sharding = Replicate(), + }, + // 1-D sharding + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({4})), + .expected_out_tensors = + { + test::AsTensor({1, 2}, TensorShape({2})), + test::AsTensor({3, 4}, TensorShape({2})), + }, + .device_indices = {0, 1}, + .sharding = Tile({2}), + }, + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({3, 4}, TensorShape({1, 2})), + }, + .device_indices = {0, 1}, + .sharding = Tile({2, 1}), + }, + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({1, 2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 3}, TensorShape({1, 2, 1})), + test::AsTensor({2, 4}, TensorShape({1, 2, 1})), + }, + .device_indices = {0, 1}, + .sharding = Tile({1, 1, 2}), + }, + { + .in_tensor = test::AsTensor({1, 2, 3, 4, 5, 6, 7, 8}, + TensorShape({4, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({3, 4}, TensorShape({1, 2})), + test::AsTensor({5, 6}, TensorShape({1, 2})), + test::AsTensor({7, 8}, TensorShape({1, 2})), + }, + .device_indices = {0, 1, 2, 3}, + .sharding = Tile({4, 1}), + }, + { + .in_tensor = test::AsTensor({1, 2, 3, 4, 5, 6, 7, 8}, + TensorShape({4, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 3, 5, 7}, + TensorShape({4, 1})), + test::AsTensor({2, 4, 6, 8}, + TensorShape({4, 1})), + }, + .device_indices = {0, 1}, + .sharding = Tile({1, 2}), + }, + // 2-D sharding + { + .in_tensor = test::AsTensor( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + TensorShape({4, 4})), + .expected_out_tensors = + { + test::AsTensor({1, 2, 5, 6}, + TensorShape({2, 2})), + test::AsTensor({3, 4, 7, 8}, + TensorShape({2, 2})), + test::AsTensor({9, 10, 13, 14}, + TensorShape({2, 2})), + test::AsTensor({11, 12, 15, 16}, + TensorShape({2, 2})), + }, + .device_indices = {0, 1, 2, 3}, + .sharding = Tile({2, 2}), + }, + { + .in_tensor = test::AsTensor( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + TensorShape({4, 1, 4})), + .expected_out_tensors = + { + test::AsTensor({1, 2, 5, 6}, + TensorShape({2, 1, 2})), + test::AsTensor({3, 4, 7, 8}, + TensorShape({2, 1, 2})), + test::AsTensor({9, 10, 13, 14}, + TensorShape({2, 1, 2})), + test::AsTensor({11, 12, 15, 16}, + TensorShape({2, 1, 2})), + }, + .device_indices = {0, 1, 2, 3}, + .sharding = Tile({2, 1, 2}), + }, + // Partial replication + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + }, + .device_indices = {0, 1, 2, 3}, + .sharding = PartialTile({1, 2, 2}), + }, + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({3, 4}, TensorShape({1, 2})), + test::AsTensor({3, 
4}, TensorShape({1, 2})), + }, + .device_indices = {0, 1, 2, 3}, + .sharding = PartialTile({2, 1, 2}), + }, + })); + +TEST_P(ShardingParamTest, MakeAssembledArrayFromHostBuffer) { + constexpr int kMaxParallelism = 16; + auto thread_pool = std::make_unique( + tsl::Env::Default(), tsl::ThreadOptions(), "Resharding", kMaxParallelism); + + Eigen::ThreadPoolDevice device(thread_pool->AsEigenThreadPool(), + kMaxParallelism); + + auto input_tensor = GetParam().in_tensor; + + // Create contexts required for the compiler execution. + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + TF_ASSERT_OK_AND_ASSIGN(auto device_list, + xla::ifrt::test_util::GetDevices( + client.get(), GetParam().device_indices)); + + xla::ifrt::ShardingParam sharding_param{ + GetParam().dim_shards, + xla::ifrt::ShardingParam::MinorToMajor(GetParam().permutation, + GetParam().axis_sizes)}; + + TF_ASSERT_OK_AND_ASSIGN( + auto sharding, xla::ifrt::ShardingParamSharding::Create( + sharding_param, device_list, xla::ifrt::MemoryKind())); + + TF_ASSERT_OK_AND_ASSIGN( + auto assembled_array, + MakeAssembledArrayFromHostBuffer(*client, input_tensor, + std::move(sharding), device)); + + TF_ASSERT_OK_AND_ASSIGN(auto disassembled_arrays, + assembled_array->DisassembleIntoSingleDeviceArrays( + xla::ifrt::ArrayCopySemantics::kAlwaysCopy)); + + ASSERT_EQ(disassembled_arrays.size(), GetParam().expected_out_tensors.size()); + + tensorflow::Tensor host_tensor(tensorflow::DT_INT32, + tensorflow::TensorShape({1, 2})); + + for (int i = 0; i < disassembled_arrays.size(); ++i) { + SCOPED_TRACE(absl::StrCat("Array ", i, " of ", disassembled_arrays.size())); + auto disassembled_array = disassembled_arrays[i]; + auto expected_out_tensor = GetParam().expected_out_tensors[i]; + ASSERT_EQ(disassembled_array->shape(), + xla::ifrt::Shape(expected_out_tensor.shape().dim_sizes())); + tensorflow::Tensor host_tensor(expected_out_tensor.dtype(), + expected_out_tensor.shape()); + TF_ASSERT_OK( + disassembled_array + ->CopyToHostBuffer(host_tensor.data(), /*byte_strides=*/{}, + xla::ifrt::ArrayCopySemantics::kAlwaysCopy) + .Await()); + EXPECT_THAT(expected_out_tensor, tensorflow::test::TensorEq(host_tensor)); + } +} + +INSTANTIATE_TEST_SUITE_P( + ShardingParamTests, ShardingParamTest, + ::testing::ValuesIn( + { + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + + .expected_out_tensors = + { + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({3, 4}, TensorShape({1, 2})), + }, + .device_indices = {0, 1}, + .dim_shards = {2, 1}, + .permutation = {0, 1}, + .axis_sizes = {2, 1}, + }, + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + }, + .device_indices = {0, 1}, + .dim_shards = {1, 2}, + .permutation = {0, 1}, + .axis_sizes = {1, 2}, + }, + { + .in_tensor = test::AsTensor( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + TensorShape({4, 4})), + .expected_out_tensors = + { + test::AsTensor({1, 2, 5, 6}, + TensorShape({2, 2})), + test::AsTensor({3, 4, 7, 8}, + TensorShape({2, 2})), + test::AsTensor({9, 10, 13, 14}, + TensorShape({2, 2})), + test::AsTensor({11, 12, 15, 16}, + TensorShape({2, 2})), + }, + .device_indices = {0, 1, 2, 3}, + .dim_shards = {2, 2}, + .permutation = {0, 1}, + .axis_sizes = {2, 2}, + }, + { + .in_tensor = test::AsTensor( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + TensorShape({4, 4})), + 
.expected_out_tensors = + { + test::AsTensor({1, 2, 3, 4, 5, 6, 7, 8}, + TensorShape({2, 4})), + test::AsTensor({9, 10, 11, 12, 13, 14, 15, 16}, + TensorShape({2, 4})), + }, + .device_indices = {0, 1}, + .dim_shards = {2, 1}, + .permutation = {1, 0}, + .axis_sizes = {2, 1}, + }, + // Full replication + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + }, + .device_indices = {0, 1}, + .dim_shards = {1, 1}, + .permutation = {0}, + .axis_sizes = {2}, + }, + // Partial replication (aka replicate_on_last_tile_dim = true) + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + }, + .device_indices = {0, 1, 2, 3}, + .dim_shards = {1, 2}, + .permutation = {0, 1}, + .axis_sizes = {2, 2}, + }, + // Partial replication that shards along the first dimension. + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({1, 2}, TensorShape({1, 2})), + test::AsTensor({3, 4}, TensorShape({1, 2})), + test::AsTensor({3, 4}, TensorShape({1, 2})), + }, + .device_indices = {0, 1, 2, 3}, + .dim_shards = {2, 1}, + .permutation = {0, 1}, + .axis_sizes = {2, 2}, + }, + // Partial replication with random device indices. + { + .in_tensor = test::AsTensor({1, 2, 3, 4}, + TensorShape({2, 2})), + .expected_out_tensors = + { + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({1, 3}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + test::AsTensor({2, 4}, TensorShape({2, 1})), + }, + .device_indices = {3, 1, 2, 0}, + .dim_shards = {1, 2}, + .permutation = {0, 1}, + .axis_sizes = {2, 2}, + }, + })); + +TEST(ShardingUtilsTest, MismatchRank) { + constexpr int kMaxParallelism = 16; + auto thread_pool = std::make_unique( + tsl::Env::Default(), tsl::ThreadOptions(), "Resharding", kMaxParallelism); + + Eigen::ThreadPoolDevice device(thread_pool->AsEigenThreadPool(), + kMaxParallelism); + + auto input_tensor = + test::AsTensor({1, 2, 3, 4}, TensorShape({2, 1, 2})); + + // Create contexts required for the compiler execution. 
+ TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + TF_ASSERT_OK_AND_ASSIGN( + auto device_list, xla::ifrt::test_util::GetDevices(client.get(), {0, 1})); + + xla::ifrt::ShardingParam sharding_param = { + /*dim_shards=*/{2, 1}, + xla::ifrt::ShardingParam::MinorToMajor(/*permutation=*/{0, 1}, + /*axis_sizes=*/{2, 1})}; + + TF_ASSERT_OK_AND_ASSIGN( + auto sharding, xla::ifrt::ShardingParamSharding::Create( + sharding_param, device_list, xla::ifrt::MemoryKind())); + + EXPECT_THAT(MakeAssembledArrayFromHostBuffer(*client, input_tensor, + std::move(sharding), device), + StatusIs(absl::StatusCode::kInvalidArgument, + "Expect equal rank of 3 but got 2")); +} + +} // namespace +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/core/tfrt/ifrt/testdata/BUILD b/tensorflow/core/tfrt/ifrt/testdata/BUILD new file mode 100644 index 00000000000000..948ce54ab983a7 --- /dev/null +++ b/tensorflow/core/tfrt/ifrt/testdata/BUILD @@ -0,0 +1,12 @@ +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//tensorflow/core/tfrt/ifrt:__subpackages__"], + licenses = ["notice"], +) + +filegroup( + name = "testdata", + srcs = glob( + ["*"], + ), +) diff --git a/tensorflow/core/tfrt/ifrt/testdata/executable.mlir b/tensorflow/core/tfrt/ifrt/testdata/executable.mlir new file mode 100644 index 00000000000000..95c558ddb7ae0b --- /dev/null +++ b/tensorflow/core/tfrt/ifrt/testdata/executable.mlir @@ -0,0 +1,6 @@ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { + %0 = "tf.MatMul"(%arg0, %arg1): (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + func.return %0 : tensor<*xi32> + } +} \ No newline at end of file diff --git a/tensorflow/core/tfrt/kernels/BUILD b/tensorflow/core/tfrt/kernels/BUILD index bf2768ec3ed419..390bef2009b9b0 100644 --- a/tensorflow/core/tfrt/kernels/BUILD +++ b/tensorflow/core/tfrt/kernels/BUILD @@ -16,6 +16,23 @@ package_group( ], ) +cc_library( + name = "ifrt_program_ops", + srcs = ["ifrt_program_ops.cc"], + hdrs = ["ifrt_program_ops.h"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core/tfrt/ifrt:ifrt_executable_registry", + "//tensorflow/core/tfrt/ifrt:ifrt_serving_executable", + "@com_google_absl//absl/base", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + cc_library( name = "stream_ops", srcs = ["stream_ops.cc"], diff --git a/tensorflow/core/tfrt/kernels/ifrt_program_ops.cc b/tensorflow/core/tfrt/kernels/ifrt_program_ops.cc new file mode 100644 index 00000000000000..92ce3cad2c1e04 --- /dev/null +++ b/tensorflow/core/tfrt/kernels/ifrt_program_ops.cc @@ -0,0 +1,67 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/tfrt/kernels/ifrt_program_ops.h" + +#include +#include + +#include "absl/base/call_once.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_executable_registry.h" + +namespace tensorflow { +namespace tfrt_stub { + +IfrtCallOp::IfrtCallOp(tensorflow::OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("program_id", &program_id_)); +} + +void IfrtCallOp::Compute(tensorflow::OpKernelContext* ctx) { + absl::call_once(init_once_, [&]() { + executable_ = tensorflow::ifrt_serving::ServingExecutableRegistry::Lookup( + program_id_); + }); + OP_REQUIRES(ctx, executable_ != nullptr, + absl::NotFoundError( + absl::StrCat("Unknown program id '", program_id_, "'"))); + + std::vector inputs; + inputs.reserve(ctx->num_inputs()); + for (int i = 0; i < ctx->num_inputs(); ++i) { + inputs.push_back(ctx->input(i)); + } + + absl::StatusOr> results = executable_->Execute(inputs); + OP_REQUIRES(ctx, results.ok(), results.status()); + + tensorflow::OpOutputList outputs(ctx, 0, results->size()); + for (int i = 0; i < results->size(); ++i) { + outputs.set(i, (*results)[i]); + } +} + +REGISTER_KERNEL_BUILDER(Name("IfrtCall").Device(tensorflow::DEVICE_CPU), + IfrtCallOp); + +} // namespace tfrt_stub +} // namespace tensorflow diff --git a/tensorflow/core/tfrt/kernels/ifrt_program_ops.h b/tensorflow/core/tfrt/kernels/ifrt_program_ops.h new file mode 100644 index 00000000000000..578ccae70b8e4b --- /dev/null +++ b/tensorflow/core/tfrt/kernels/ifrt_program_ops.h @@ -0,0 +1,51 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TFRT_KERNELS_IFRT_PROGRAM_OPS_H_ +#define TENSORFLOW_CORE_TFRT_KERNELS_IFRT_PROGRAM_OPS_H_ + +#include + +#include + +#include "absl/base/call_once.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h" + +namespace tensorflow { +namespace tfrt_stub { + +// TensorFlow op that calls a Ifrt program registered in `ProgramRegistry`. +class IfrtCallOp : public tensorflow::OpKernel { + public: + explicit IfrtCallOp(tensorflow::OpKernelConstruction* ctx); + + IfrtCallOp(const IfrtCallOp& other) = delete; + IfrtCallOp& operator=(const IfrtCallOp& other) = delete; + + void Compute(tensorflow::OpKernelContext* ctx) override; + + private: + // Op attributes. + int64_t program_id_; + + // Ifrt program to be called. Cached after the first call. 
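  // Editorial note (descriptive, not part of the original patch): Compute()
  // can be invoked concurrently, so the ServingExecutableRegistry lookup runs
  // exactly once, guarded by `init_once_`, and the result is reused via
  // `executable_` on every later call.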
+ absl::once_flag init_once_; + std::shared_ptr executable_; +}; + +} // namespace tfrt_stub +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TFRT_KERNELS_IFRT_PROGRAM_OPS_H_ diff --git a/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h b/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h index f6b8de5da15dcb..f82666f172a37d 100644 --- a/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h +++ b/tensorflow/core/tfrt/mlrt/bytecode/bytecode.h @@ -109,6 +109,8 @@ class Buffer { size_t size() const { return buffer_.size(); } bool empty() const { return buffer_.empty(); } + void shrink_to_fit() { buffer_.shrink_to_fit(); } + private: static_assert(alignof(std::max_align_t) >= 8, "The bytecode buffer needs to be at least 8-byte aligned."); diff --git a/tensorflow/core/tfrt/mlrt/kernel/BUILD b/tensorflow/core/tfrt/mlrt/kernel/BUILD index 9dacee29030c8f..cca6cddbfb650a 100644 --- a/tensorflow/core/tfrt/mlrt/kernel/BUILD +++ b/tensorflow/core/tfrt/mlrt/kernel/BUILD @@ -10,7 +10,6 @@ package( # copybara:uncomment "//learning/brain/tfrt:__subpackages__", # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", "//tensorflow/core/tfrt/graph_executor:__subpackages__", - "//tensorflow/core/tfrt/mlrt/application/tensorflow/tests:__subpackages__", "//tensorflow/core/tfrt/saved_model:__subpackages__", "//tensorflow/core/tfrt/tfrt_session:__subpackages__", ], diff --git a/tensorflow/core/tfrt/ops/BUILD b/tensorflow/core/tfrt/ops/BUILD index cf9beb34fc7231..d09db6a81dc3c2 100644 --- a/tensorflow/core/tfrt/ops/BUILD +++ b/tensorflow/core/tfrt/ops/BUILD @@ -5,6 +5,32 @@ package( default_visibility = ["//tensorflow/core/tfrt/__subpackages__"], ) +tf_gen_op_libs( + op_lib_names = ["ifrt_program_ops"], + sub_directory = "", + deps = ["//tensorflow/core:lib"], +) + +tf_gen_op_wrapper_cc( + name = "gen_ifrt_program_ops", + out_ops_file = "gen_ifrt_program_ops", + deps = [":ifrt_program_ops_op_lib"], +) + +cc_library( + name = "gen_ifrt_program_ops_cc", + srcs = ["gen_ifrt_program_ops.cc"], + hdrs = ["gen_ifrt_program_ops.h"], + deps = [ + ":ifrt_program_ops_op_lib", + "//tensorflow/cc:const_op", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], +) + tf_gen_op_libs( op_lib_names = ["stream_ops"], sub_directory = "", diff --git a/tensorflow/core/tfrt/ops/ifrt_program_ops.cc b/tensorflow/core/tfrt/ops/ifrt_program_ops.cc new file mode 100644 index 00000000000000..ab8e14b2e41eac --- /dev/null +++ b/tensorflow/core/tfrt/ops/ifrt_program_ops.cc @@ -0,0 +1,46 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { +namespace tfrt_stub { + +REGISTER_OP("IfrtCall") + .Input("args: Tin") + .Output("results: Tout") + .Attr("Tin: list(type) >= 0") + .Attr("Tout: list(type) >= 0") + .Attr("program_id: int") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::UnknownShape) + .Doc(R"( +Calls an IFRT program identified by the given program id. + +This op looks up a `ServingExecutable` from `ServingExecutableRegistry` using +the program id, calls the executable with the op's inputs as arguments, and +returns its results as the op's outputs. + +Note that this op is not part of a stable interface. Users must not use this op +in their SavedModel and instead rely on Ifrt Serving's mechanism that +automatically inserts this op with graph rewrite. + +program_id: int64 id that can be used to look up compiled programs from + `ServingExecutableRegistry`. +)"); + +} // namespace tfrt_stub +} // namespace tensorflow diff --git a/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc b/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc index 17b1ba31fd044c..dc225b9ab2448d 100644 --- a/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc +++ b/tensorflow/core/tfrt/saved_model/saved_model_aot_compile.cc @@ -320,9 +320,8 @@ StatusOr AotCompileToGpuPjRtLoadedExecutableWithDevice( int graph_def_version, const std::vector& args, bool has_ref_vars, bool may_alias_resource_update, XlaCompiler::CompilationResult** compilation_result) { - TF_ASSIGN_OR_RETURN(auto client, xla::GetStreamExecutorGpuClient( - true, /*allocator_config=*/{}, - /*node_id=*/0)); + TF_ASSIGN_OR_RETURN(auto client, + xla::GetStreamExecutorGpuClient(xla::GpuClientOptions())); auto se_client = absl::WrapUnique( tensorflow::down_cast(client.release())); diff --git a/tensorflow/core/tfrt/saved_model/tests/BUILD b/tensorflow/core/tfrt/saved_model/tests/BUILD index 3495c1150ac420..12b9779fef43f1 100644 --- a/tensorflow/core/tfrt/saved_model/tests/BUILD +++ b/tensorflow/core/tfrt/saved_model/tests/BUILD @@ -619,6 +619,64 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "saved_model_ifrt_testlib", + testonly = 1, + srcs = ["saved_model_ifrt_test.cc"], + data = [ + "toy_v2/saved_model.pb", + "toy_v2/variables/variables.data-00000-of-00001", + "toy_v2/variables/variables.index", + ], + tags = ["no_oss"], + deps = [ + "//tensorflow/compiler/mlir/tfrt:tfrt_compile_options", + "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:ifrt_backend_compiler", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", + "//tensorflow/core/runtime_fallback/runtime:runtime_fallback_alwayslink", + "//tensorflow/core/tfrt:ifrt_program_ops_op_lib", + "//tensorflow/core/tfrt/ifrt:ifrt_model_context", + "//tensorflow/core/tfrt/runtime", + "//tensorflow/core/tfrt/saved_model:saved_model_cpu", + "//tensorflow/core/tfrt/saved_model:saved_model_testutil", + "@com_google_absl//absl/status", + "@com_google_googletest//:gtest", + "@local_xla//xla/python/ifrt", + "@local_xla//xla/python/ifrt:test_util", + "@local_xla//xla/python/pjrt_ifrt:tfrt_cpu_client_test_lib", + "@tf_runtime//:basic_kernels_alwayslink", + "@tf_runtime//:core_runtime_alwayslink", + "@tf_runtime//:test_kernels_alwayslink", + 
"@tf_runtime//backends/cpu:core_runtime_alwayslink", + "@tf_runtime//backends/cpu:tf_ops_alwayslink", + ], +) + +tf_cc_test( + name = "saved_model_ifrt_test", + srcs = [], + tags = ["no_oss"], + deps = [ + ":saved_model_ifrt_testlib", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "saved_model_ifrt_test_mlrt", + srcs = [], + args = ["--enable_mlrt=true"], + tags = ["no_oss"], + deps = [ + ":saved_model_ifrt_testlib", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "saved_model_test", srcs = [], diff --git a/tensorflow/core/tfrt/saved_model/tests/gen_saved_model.bzl b/tensorflow/core/tfrt/saved_model/tests/gen_saved_model.bzl index 0cb693e7b8763c..f3ed254c39689e 100644 --- a/tensorflow/core/tfrt/saved_model/tests/gen_saved_model.bzl +++ b/tensorflow/core/tfrt/saved_model/tests/gen_saved_model.bzl @@ -18,3 +18,18 @@ def gen_saved_model(model_name = "", script = "", **kwargs): tools = [script], **kwargs ) + +def gen_variableless_saved_model(model_name = "", script = "", **kwargs): + native.genrule( + name = "saved_model_gen_" + model_name, + srcs = [], + outs = [ + model_name + "/saved_model.pb", + ], + cmd = if_google( + "$(location " + script + ") --saved_model_path=$(RULEDIR)/" + model_name, + "touch $(OUTS)", # TODO(b/188517768): fix model gen. + ), + tools = [script], + **kwargs + ) diff --git a/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc b/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc new file mode 100644 index 00000000000000..6113741df2eec4 --- /dev/null +++ b/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc @@ -0,0 +1,93 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include +#include + +#include +#include +#include "absl/status/status.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "xla/python/ifrt/client.h" +#include "xla/python/ifrt/test_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/resource_loader.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_model_context.h" +#include "tensorflow/core/tfrt/runtime/runtime.h" +#include "tensorflow/core/tfrt/saved_model/saved_model.h" +#include "tensorflow/core/tfrt/saved_model/saved_model_testutil.h" +#include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace tfrt_stub { +namespace { + +TEST(SavedModelIfrt, Basic) { + std::string saved_model_dir = tensorflow::GetDataDependencyFilepath( + "tensorflow/core/tfrt/saved_model/tests/toy_v2"); + + auto runtime = + tensorflow::tfrt_stub::Runtime::Create(/*num_inter_op_threads=*/4); + + // Create contexts required for the compiler execution. 
+ TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + + // Use IFRT compiler + runtime->AddCreateRuntimeResourceFn( + [&](tensorflow::tfrt_stub::ModelRuntimeContext& model_context) { + tensorflow::ifrt_serving::IfrtModelContext ifrt_model_context(client); + + model_context.resource_context() + .CreateResource( + "IfrtModelContext", std::move(ifrt_model_context)); + return absl::OkStatus(); + }); + tensorflow::ifrt_serving::IfrtBackendCompiler ifrt_compiler; + + auto options = DefaultSavedModelOptions(runtime.get()); + options.enable_lazy_loading = true; + options.lazy_loading_use_graph_executor = true; + options.graph_execution_options.compile_options.backend_compiler = + &ifrt_compiler; + + TF_ASSERT_OK_AND_ASSIGN( + auto saved_model, SavedModelImpl::LoadSavedModel(options, saved_model_dir, + /*tags=*/{"serve"})); + + // Set input 'x' to [[1, 1, 1]] + std::vector inputs; + inputs.push_back( + CreateTfTensor(/*shape=*/{1, 3}, /*data=*/{1, 1, 1})); + + tfrt::SavedModel::RunOptions run_options; + + std::vector outputs; + TF_ASSERT_OK( + saved_model->Run(run_options, "serving_default", inputs, &outputs)); + ASSERT_EQ(outputs.size(), 1); + + EXPECT_THAT(GetTfTensorData(outputs[0]), + ::testing::ElementsAreArray({6})); +} + +} // namespace +} // namespace tfrt_stub +} // namespace tensorflow diff --git a/tensorflow/core/tfrt/utils/BUILD b/tensorflow/core/tfrt/utils/BUILD index f9879e84d4562d..7517e07928f87b 100644 --- a/tensorflow/core/tfrt/utils/BUILD +++ b/tensorflow/core/tfrt/utils/BUILD @@ -270,6 +270,7 @@ tf_cc_test( deps = [ ":fallback_tensor", "//tensorflow/core/common_runtime:dma_helper", + "//tensorflow/core/framework:tensor_shape", "@com_google_googletest//:gtest_main", ], ) diff --git a/tensorflow/core/tfrt/utils/fallback_tensor.h b/tensorflow/core/tfrt/utils/fallback_tensor.h index 393e06e75f4a90..0856117d2b7a09 100644 --- a/tensorflow/core/tfrt/utils/fallback_tensor.h +++ b/tensorflow/core/tfrt/utils/fallback_tensor.h @@ -64,7 +64,7 @@ class FallbackTensor { FallbackTensor(const FallbackTensor& other) { *this = other; } FallbackTensor& operator=(const FallbackTensor& other) { tsl::profiler::TraceMe trace_me("FallbackTensor::Copy"); - if (!other.is_immutable()) { + if (!other.is_immutable() && other.buffer() != nullptr) { // Create a new TensorBuffer which contains a new atomic counter for each // result, to avoid downstream threads contending the original atomic // counter. @@ -72,7 +72,8 @@ class FallbackTensor { tensorflow::tfrt_stub::ImmutableTensor::Create(other.tensor()) .tensor()); } else { - // For immutable tensors, we just need to copy the pointer. + // For immutable tensors or empty tensors, we just need to copy the + // pointer as they don't incur atomic operations when they are referenced. tensor_ = other.tensor(); } is_immutable_ = true; diff --git a/tensorflow/core/tfrt/utils/fallback_tensor_test.cc b/tensorflow/core/tfrt/utils/fallback_tensor_test.cc index 9c54e8704158c3..1e3de50a38d9fc 100644 --- a/tensorflow/core/tfrt/utils/fallback_tensor_test.cc +++ b/tensorflow/core/tfrt/utils/fallback_tensor_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include #include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/tensor_shape.h" namespace tensorflow { namespace tfrt_stub { @@ -158,6 +159,15 @@ TEST(FallbackTensorTest, FallbackTensorCopyRootBuffer) { tensorflow::DMAHelper::buffer(&tensor)); } +TEST(FallbackTensorTest, EmptyTensor) { + tensorflow::Tensor tensor(tensorflow::DT_FLOAT, + tensorflow::TensorShape({1, 0})); + + FallbackTensor fallback_tensor(tensor); + auto copy = fallback_tensor; + ASSERT_FALSE(copy.buffer()); +} + } // namespace } // namespace tfrt_stub } // namespace tensorflow diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 317060b2ca9408..eade2efb96f75c 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -1,19 +1,18 @@ # Description: Utilities for TPU Operations -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_libtpu", "if_windows", "tf_cc_test", ) +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ "//tensorflow/compiler/mlir/tf2xla:__subpackages__", - "//tensorflow/compiler/xrt:__subpackages__", "//tensorflow/core/tpu:__subpackages__", "//tensorflow/dtensor:__subpackages__", ], diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 212bfbcbdcb844..48b4711d37cf75 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -46,7 +46,7 @@ package_group( packages = [ "//tensorflow/compiler/mlir/quantization/...", "//tensorflow/compiler/mlir/tf2xla/...", - "//tensorflow/compiler/xrt/kernels/...", + "//tensorflow/core/tfrt/ifrt/...", "//tensorflow/core/tpu/...", "//tensorflow/dtensor/...", "//third_party/py/jax_tpu_embedding/...", @@ -60,13 +60,15 @@ tf_kernel_library( visibility = ["//visibility:public"], deps = [ ":cross_replica_ops", + ":global_iter_id_op", ":host_compute_ops", ":image_resize_ops", ":infeed_ops", ":outfeed_ops", ":replication_ops", ":sharding_util_ops", - ":sparse_core_ops", + ":sparse_core_preprocess_ops", + ":sparse_core_xla_ops", ":topk_ops", ":tpu_compile_op", ":tpu_configuration_ops", @@ -184,6 +186,7 @@ cc_library( "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/core:framework", + "//tensorflow/core:lib", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:statusor", @@ -198,7 +201,6 @@ cc_library( "@local_xla//xla/client:xla_computation", "@local_xla//xla/client/lib:slicing", "@local_xla//xla/stream_executor/tpu:c_api_decl", - "@local_xla//xla/stream_executor/tpu:status_helper", "@local_xla//xla/stream_executor/tpu:tpu_api", "@local_xla//xla/stream_executor/tpu:tpu_ops_c_api_hdrs", ], @@ -512,7 +514,6 @@ cc_library( ":tpu_program_group_interface", "//tensorflow/compiler/tf2xla:host_compute_metadata_proto_cc", "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xrt:xrt_proto_cc", "//tensorflow/core:lib", "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", "@local_xla//xla:xla_proto_cc", @@ -1342,6 +1343,7 @@ cc_library( name = "sharding_util_ops", srcs = ["sharding_util_ops.cc"], deps = [ + ":sharding_utils", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core/framework:op_requires", 
@@ -1349,6 +1351,7 @@ cc_library( "//tensorflow/core/platform:refcount", "//tensorflow/core/platform:status", "//tensorflow/core/platform:statusor", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -1360,12 +1363,53 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "sharding_utils", + srcs = ["sharding_utils.cc"], + hdrs = ["sharding_utils.h"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:macros", + "@local_tsl//tsl/platform:statusor", + ], +) + +tf_cc_test( + name = "sharding_utils_test", + srcs = ["sharding_utils_test.cc"], + deps = [ + ":sharding_utils", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:statusor", + ], +) + tf_kernel_library( name = "global_iter_id_op", srcs = ["global_iter_id.cc"], deps = [ "//tensorflow/core:framework", "//tensorflow/core/kernels:partitioned_function_ops", + "//tensorflow/core/tpu/ops:sparse_core_ops", ], ) @@ -1479,52 +1523,6 @@ tf_cc_test( ], ) -cc_library( - name = "sparse_core_ops", - visibility = ["//visibility:public"], - deps = [ - ":sparse_core_preprocess_ops", - "//tensorflow/compiler/jit:xla_device", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla:xla_op_registry", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:functional_ops_op_lib", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:nn_ops_op_lib", - "//tensorflow/core:no_op_op_lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:sendrecv_ops_op_lib", - "//tensorflow/core/kernels:ops_util_hdrs", - "//tensorflow/core/kernels:transpose_functor", - "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/core/tpu:tpu_configuration", - "//tensorflow/core/tpu/kernels:global_iter_id_op", - "//tensorflow/core/tpu/kernels:host_compute_ops", - "//tensorflow/core/tpu/kernels:image_resize_ops", - "//tensorflow/core/tpu/kernels:infeed_ops", - "//tensorflow/core/tpu/kernels:outfeed_ops", - "//tensorflow/core/tpu/kernels:replication_ops", - "//tensorflow/core/tpu/kernels:sharding_util_ops", - "//tensorflow/core/tpu/kernels:topk_ops", - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_interface", - "//tensorflow/core/tpu/kernels:tpu_functional_ops", - "//tensorflow/core/tpu/kernels:tpu_handle_to_key_op", - "//tensorflow/core/tpu/kernels:tpu_op_consts", - "//tensorflow/core/tpu/kernels:transfer_ops", - "//tensorflow/dtensor/cc:dtensor_tpu_kernels", - "@com_google_absl//absl/strings", - "@local_xla//xla:util", - "@local_xla//xla/client:xla_builder", - "@local_xla//xla/client/lib:constants", - 
"@local_xla//xla/stream_executor:multi_platform_manager", - ], -) - tf_proto_library( name = "sparse_core_layout_proto", srcs = ["sparse_core_layout.proto"], diff --git a/tensorflow/core/tpu/kernels/global_iter_id.cc b/tensorflow/core/tpu/kernels/global_iter_id.cc index 92a44d7c106a2b..11b80146f63153 100644 --- a/tensorflow/core/tpu/kernels/global_iter_id.cc +++ b/tensorflow/core/tpu/kernels/global_iter_id.cc @@ -29,7 +29,6 @@ class GlobalIterId : public OpKernel { ctx->set_output(0, Tensor(ctx->frame_iter().iter_id)); } }; -REGISTER_OP("GlobalIterId").Output("iter_id: int64").SetIsStateful(); REGISTER_KERNEL_BUILDER(Name("GlobalIterId").Device(DEVICE_CPU), GlobalIterId); } // anonymous namespace diff --git a/tensorflow/core/tpu/kernels/sharding_util_ops.cc b/tensorflow/core/tpu/kernels/sharding_util_ops.cc index 5513547b6bb67b..fe726011527165 100644 --- a/tensorflow/core/tpu/kernels/sharding_util_ops.cc +++ b/tensorflow/core/tpu/kernels/sharding_util_ops.cc @@ -15,11 +15,14 @@ limitations under the License. #include #include +#include +#include #include #include #define EIGEN_USE_THREADS +#include "absl/log/check.h" #include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -34,10 +37,12 @@ limitations under the License. #include "tensorflow/core/framework/resource_var.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tpu/kernels/sharding_utils.h" #include "tsl/platform/errors.h" #include "tsl/platform/logging.h" // IWYU pragma: keep #include "tsl/platform/macros.h" @@ -129,454 +134,63 @@ Status CreateResourceInvalidDTypeError(const ResourceHandle& handle, DataTypeString(expected_dtype), ".")); } -// Converts flatten index to start indices (subscript scaled with slice shape) -// for determining where to start a slice in the input tensor. 
-template -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); -template <> -Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, int index); - -template -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, - const int index) { - return Eigen::DSizes(); -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[0] = index * slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[1] = (index % num_partitions[1]) * slice_shape[1]; - subscript[0] = (index / num_partitions[1]) * slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[2] = (index % num_partitions[2]) * slice_shape[2]; - subscript[1] = - ((index / num_partitions[2]) % num_partitions[1]) * slice_shape[1]; - subscript[0] = - (index / (num_partitions[2] * num_partitions[1])) * slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[3] = (index % num_partitions[3]) * slice_shape[3]; - subscript[2] = - ((index / num_partitions[3]) % num_partitions[2]) * slice_shape[2]; - subscript[1] = - ((index / (num_partitions[3] * num_partitions[2])) % num_partitions[1]) * - slice_shape[1]; - subscript[0] = - (index / (num_partitions[3] * num_partitions[2] * num_partitions[1])) * - slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[4] = (index % num_partitions[4]) * slice_shape[4]; - subscript[3] = - ((index / num_partitions[4]) % num_partitions[3]) * slice_shape[3]; - subscript[2] = - ((index / (num_partitions[4] * num_partitions[3])) % num_partitions[2]) * - slice_shape[2]; - subscript[1] = - ((index / (num_partitions[4] * num_partitions[3] * num_partitions[2])) % - num_partitions[1]) * - slice_shape[1]; - subscript[0] = (index / (num_partitions[4] * num_partitions[3] * - 
num_partitions[2] * num_partitions[1])) * - slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[5] = (index % num_partitions[5]) * slice_shape[5]; - subscript[4] = - ((index / num_partitions[5]) % num_partitions[4]) * slice_shape[4]; - subscript[3] = - ((index / (num_partitions[5] * num_partitions[4])) % num_partitions[3]) * - slice_shape[3]; - subscript[2] = - ((index / (num_partitions[5] * num_partitions[4] * num_partitions[3])) % - num_partitions[2]) * - slice_shape[2]; - subscript[1] = ((index / (num_partitions[5] * num_partitions[4] * - num_partitions[3] * num_partitions[2])) % - num_partitions[1]) * - slice_shape[1]; - subscript[0] = - (index / (num_partitions[5] * num_partitions[4] * num_partitions[3] * - num_partitions[2] * num_partitions[1])) * - slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[6] = (index % num_partitions[6]) * slice_shape[6]; - subscript[5] = - ((index / num_partitions[6]) % num_partitions[5]) * slice_shape[5]; - subscript[4] = - ((index / (num_partitions[6] * num_partitions[5])) % num_partitions[4]) * - slice_shape[4]; - subscript[3] = - ((index / (num_partitions[6] * num_partitions[5] * num_partitions[4])) % - num_partitions[3]) * - slice_shape[3]; - subscript[2] = ((index / (num_partitions[6] * num_partitions[5] * - num_partitions[4] * num_partitions[3])) % - num_partitions[2]) * - slice_shape[2]; - subscript[1] = - ((index / (num_partitions[6] * num_partitions[5] * num_partitions[4] * - num_partitions[3] * num_partitions[2])) % - num_partitions[1]) * - slice_shape[1]; - subscript[0] = - (index / (num_partitions[6] * num_partitions[5] * num_partitions[4] * - num_partitions[3] * num_partitions[2] * num_partitions[1])) * - slice_shape[0]; - return subscript; -} - -template <> -Eigen::DSizes GetSliceIndices( - absl::Span num_partitions, - const Eigen::DSizes& slice_shape, const int index) { - Eigen::DSizes subscript; - subscript[7] = (index % num_partitions[7]) * slice_shape[7]; - subscript[6] = - ((index / num_partitions[7]) % num_partitions[6]) * slice_shape[6]; - subscript[5] = - ((index / (num_partitions[7] * num_partitions[6])) % num_partitions[5]) * - slice_shape[5]; - subscript[4] = - ((index / (num_partitions[7] * num_partitions[6] * num_partitions[5])) % - num_partitions[4]) * - slice_shape[4]; - subscript[3] = ((index / (num_partitions[7] * num_partitions[6] * - num_partitions[5] * num_partitions[4])) % - num_partitions[3]) * - slice_shape[3]; - subscript[2] = - ((index / (num_partitions[7] * num_partitions[6] * num_partitions[5] * - num_partitions[4] * num_partitions[3])) % - num_partitions[2]) * - slice_shape[2]; - subscript[1] = - ((index / (num_partitions[7] * num_partitions[6] * num_partitions[5] * - num_partitions[4] * num_partitions[3] * num_partitions[2])) % - num_partitions[1]) * - slice_shape[1]; - subscript[0] = - (index / (num_partitions[7] * num_partitions[6] * num_partitions[5] * - num_partitions[4] * num_partitions[3] * num_partitions[2] * - num_partitions[1])) * - slice_shape[0]; - return subscript; -} - constexpr absl::string_view kTensorName = "'input' tensor"; constexpr absl::string_view kResourceName = "'resource' variable tensor"; -template -Eigen::DSizes TF_ATTRIBUTE_NOINLINE -ShapeAsEigenDSizes(const TensorShape& 
shape); -template -Eigen::DSizes ShapeAsEigenDSizes( - const TensorShape& shape) { - return shape.AsEigenDSizes(); -} - -bool TF_ATTRIBUTE_NOINLINE -ValidateShapesForSlice(OpKernelContext* ctx, bool resource, const Tensor* input, - const std::vector& num_splits, - const std::vector& paddings); - -bool ValidateShapesForSlice(OpKernelContext* ctx, bool resource, - const Tensor* input, - const std::vector& num_splits, - const std::vector& paddings) { - const auto& ishape = input->shape(); - - Status s; - - absl::string_view input_name = resource ? kResourceName : kTensorName; - const int rank = ishape.dims(); - const auto& input_shape = ishape.dim_sizes(); - if (rank <= 0 || rank > 8) { - s = absl::InvalidArgumentError(absl::StrCat( - input_name, " must have rank in range (0, 8], but got ", rank, ".")); - } else if (rank != num_splits.size()) { - s = absl::InvalidArgumentError(absl::StrCat( - input_name, " rank must be the same as 'num_splits' length ", - num_splits.size(), ", but got rank ", rank, ".")); - } else { - for (int dim = 0; dim < rank; ++dim) { - const auto input_shape_dim = input_shape[dim]; - const auto paddings_dim = paddings[dim]; - const auto num_splits_dim = num_splits[dim]; - if ((input_shape_dim + paddings_dim) % num_splits_dim != 0) { - s = absl::InvalidArgumentError(absl::StrCat( - input_name, " shape dimension ", dim, " (", input_shape_dim, - ") with padding ", paddings_dim, - " must be evenly divisible by 'num_splits' ", num_splits_dim, ".")); - break; - } - } - } - if (!s.ok()) { - ctx->CtxFailure(__FILE__, __LINE__, s); - return false; - } - return true; -} - // Shared base class to save code space +template class XlaSplitNDShared : public OpKernel { public: explicit TF_ATTRIBUTE_NOINLINE XlaSplitNDShared(OpKernelConstruction* ctx) - : OpKernel(ctx), num_slices_(1), has_paddings_(false) { - GetAndValidateAttributes(/*split=*/true, ctx, num_splits_, num_slices_, - paddings_, has_paddings_); + : OpKernel(ctx) { + std::vector num_splits; + int num_slices = 1; + std::vector paddings; + bool has_paddings = false; + + GetAndValidateAttributes(/*split=*/true, ctx, num_splits, num_slices, + paddings, has_paddings); + + auto xla_nd_splitter = XlaNDSplitter::Create( + num_splits, num_slices, paddings, has_paddings); + OP_REQUIRES_OK(ctx, xla_nd_splitter.status()); + splitter_ = *std::move(xla_nd_splitter); } protected: - template - class SliceAndMaybePadState { - public: - int num_complete_pad_dims_; - int num_partial_pad_dims_; - TensorShape non_padded_slice_shape_; - Eigen::array, Rank> slice_paddings_; - Eigen::DSizes slice_indices_; - Eigen::DSizes output_slice_shape_dsizes_; - Eigen::DSizes non_padded_slice_shape_dsizes_; - - TF_ATTRIBUTE_NOINLINE SliceAndMaybePadState( - absl::Span num_splits, - const absl::Span input_shape, - const TensorShape& output_slice_shape, int slice_index) { - output_slice_shape_dsizes_ = ShapeAsEigenDSizes(output_slice_shape); - num_complete_pad_dims_ = 0; - num_partial_pad_dims_ = 0; - slice_indices_ = GetSliceIndices( - num_splits, output_slice_shape_dsizes_, slice_index); - - // Calculate paddings necessary for slice instead of padding input and - // slicing subsequently to reduce temporary memory allocation. - for (int dim = 0; dim < Rank; ++dim) { - const int64_t dim_size = input_shape[dim]; - const int64_t out_dim = output_slice_shape_dsizes_[dim]; - int64_t non_padded_dim = 0; - if (slice_indices_[dim] >= dim_size) { - // Complete padding. 
- slice_indices_[dim] = dim_size; - non_padded_dim = 0; - slice_paddings_[dim] = {0, out_dim}; - num_complete_pad_dims_++; - } else if (slice_indices_[dim] + out_dim > dim_size) { - // Partial padding. - non_padded_dim = dim_size - slice_indices_[dim]; - slice_paddings_[dim] = {0, out_dim - non_padded_dim}; - num_partial_pad_dims_++; - } else { - non_padded_dim = out_dim; - } - non_padded_slice_shape_.AddDim(non_padded_dim); - } - non_padded_slice_shape_dsizes_ = - ShapeAsEigenDSizes(non_padded_slice_shape_); - } - }; - static void TF_ATTRIBUTE_NOINLINE GetDtypeHelper(OpKernelConstruction* ctx, const char* attr_name, DataType* dtype_ptr) { OP_REQUIRES_OK(ctx, ctx->GetAttr(attr_name, dtype_ptr)); } - std::vector num_splits_; - int num_slices_; - std::vector paddings_; - bool has_paddings_; + std::optional> splitter_; }; template -class XlaSplitNDBaseOp : public XlaSplitNDShared { +class XlaSplitNDBaseOp : public XlaSplitNDShared { public: explicit XlaSplitNDBaseOp(OpKernelConstruction* ctx) - : XlaSplitNDShared(ctx) {} + : XlaSplitNDShared(ctx) {} protected: void ComputeInternal( bool resource, OpKernelContext* ctx, const std::function& assign_or_copy_value_fn, const Tensor* input) { - const int rank = input->shape().dims(); const auto& input_shape = input->shape().dim_sizes(); - if (!ValidateShapesForSlice(ctx, resource, input, num_splits_, paddings_)) { - return; - } - - TensorShape output_slice_shape; - for (int i = 0; i < rank; ++i) { - output_slice_shape.AddDim((input_shape[i] + paddings_[i]) / - ((num_slices_ == 1) ? 1 : num_splits_[i])); - } - if (num_slices_ == 1 && !has_paddings_) { - // Handle simple case first - OP_REQUIRES_OK(ctx, assign_or_copy_value_fn(*input)); - } else { - const Device& device = ctx->eigen_device(); - std::vector output_slices(num_slices_); - for (int i = 0; i < num_slices_; i++) { - OP_REQUIRES_OK(ctx, - ctx->allocate_output( - /*index=*/i, output_slice_shape, &output_slices[i])); - } - - if (rank == 1) { - SliceAndMaybePad<1>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 2) { - SliceAndMaybePad<2>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 3) { - SliceAndMaybePad<3>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 4) { - SliceAndMaybePad<4>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 5) { - SliceAndMaybePad<5>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 6) { - SliceAndMaybePad<6>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 7) { - SliceAndMaybePad<7>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } else if (rank == 8) { - SliceAndMaybePad<8>(ctx, device, input, input_shape, output_slice_shape, - output_slices); - } - return; - } - } - - private: - void TF_ATTRIBUTE_NOINLINE SetToConstant(Tensor* output_slice, - const Device& device) { - auto output_flat = output_slice->flat(); - output_flat.device(device) = output_flat.constant(T()); - } - - template - void TF_ATTRIBUTE_NOINLINE AssignFromInput( - Tensor* output_slice, const Device& device, const Tensor* input, - const Eigen::DSizes& slice_indices, - const Eigen::DSizes& output_slice_shape_dsizes) { - output_slice->tensor().device(device) = - input->tensor().slice(slice_indices, - output_slice_shape_dsizes); - } + absl::string_view input_name = resource ? 
kResourceName : kTensorName; + auto allocate_output_fn = [&](int i, const TensorShape& output_slice_shape, + Tensor** tensor) { + return ctx->allocate_output( + /*index=*/i, output_slice_shape, tensor); + }; - template - void TF_ATTRIBUTE_NOINLINE SliceAndMaybePad( - OpKernelContext* ctx, const Device& device, const Tensor* input, - const absl::Span input_shape, - const TensorShape& output_slice_shape, - const std::vector& output_slices) { - const auto& input_tensor = input->tensor(); - // Slice shape with optional padding. - for (int i = 0; i < num_slices_; ++i) { - Tensor* output_slice = output_slices[i]; - SliceAndMaybePadState r(num_splits_, input_shape, - output_slice_shape, i); - if (r.num_complete_pad_dims_ == Rank || - (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0)) { - // Need to init padding - SetToConstant(output_slice, device); - } - if (r.num_complete_pad_dims_ == Rank) { - // Done - } else if (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0) { - output_slice->tensor() - .slice(Eigen::DSizes(), - r.non_padded_slice_shape_dsizes_) - .device(device) = input_tensor.slice( - r.slice_indices_, r.non_padded_slice_shape_dsizes_); - } else { - AssignFromInput(output_slice, device, input, r.slice_indices_, - r.output_slice_shape_dsizes_); - } - } + const Device& device = ctx->eigen_device(); + auto status = this->splitter_->Split( + input, input_name, assign_or_copy_value_fn, allocate_output_fn, device); + OP_REQUIRES_OK(ctx, status); } }; @@ -605,7 +219,7 @@ class ReadVariableXlaSplitNDOp : public XlaSplitNDBaseOp { explicit TF_ATTRIBUTE_NOINLINE ReadVariableXlaSplitNDOp( OpKernelConstruction* ctx) : XlaSplitNDBaseOp(ctx) { - XlaSplitNDShared::GetDtypeHelper(ctx, "T", &dtype_); + XlaSplitNDShared::GetDtypeHelper(ctx, "T", &dtype_); } void Compute(OpKernelContext* ctx) override { @@ -671,12 +285,18 @@ TF_CALL_uint4(REGISTER_READ_VARIABLE_XLA_SPLIT_ND); #undef REGISTER_READ_VARIABLE_XLA_SPLIT_ND // Shared base class to save code space +template class XlaConcatNDShared : public OpKernel { public: explicit TF_ATTRIBUTE_NOINLINE XlaConcatNDShared(OpKernelConstruction* ctx) : OpKernel(ctx), num_slices_(1), has_paddings_(false) { GetAndValidateAttributes(/*split=*/false, ctx, num_concats_, num_slices_, paddings_, has_paddings_); + + auto xla_nd_concatenator = XlaNDConcatenator::Create( + num_concats_, num_slices_, paddings_, has_paddings_); + OP_REQUIRES_OK(ctx, xla_nd_concatenator.status()); + concatenator_ = *std::move(xla_nd_concatenator); } protected: @@ -714,132 +334,31 @@ class XlaConcatNDShared : public OpKernel { return absl::OkStatus(); } - void ApplyAssignOrCopyShared( - OpKernelContext* ctx, - const std::function& assign_or_copy_value_fn, - const Tensor& input) { - OP_REQUIRES_OK(ctx, assign_or_copy_value_fn(input)); - } - - template - class MaybeUnpadAndAssignState { - public: - int num_complete_pad_dims_; - int num_partial_pad_dims_; - TensorShape non_padded_slice_shape_; - Eigen::DSizes slice_shape_dsizes_; - Eigen::array, Rank> slice_paddings_; - Eigen::DSizes slice_indices_; - Eigen::DSizes output_slice_shape_dsizes_; - Eigen::DSizes non_padded_slice_shape_dsizes_; - - TF_ATTRIBUTE_NOINLINE MaybeUnpadAndAssignState( - absl::Span num_concats, const Tensor& input0, - Tensor* output, int slice_index) { - slice_shape_dsizes_ = input0.shape().AsEigenDSizes(); - slice_indices_ = - GetSliceIndices(num_concats, slice_shape_dsizes_, slice_index); - num_complete_pad_dims_ = 0; - num_partial_pad_dims_ = 0; - // Calculate paddings necessary to strip from slice. 
- for (int dim = 0; dim < Rank; ++dim) { - const int64_t dim_size = output->shape().dim_size(dim); - int64_t non_padded_dim = 0; - if (slice_indices_[dim] >= dim_size) { - // Complete padding. - slice_indices_[dim] = dim_size; - non_padded_dim = 0; - num_complete_pad_dims_++; - } else if (slice_indices_[dim] + slice_shape_dsizes_[dim] > dim_size) { - // Partial padding. - non_padded_dim = dim_size - slice_indices_[dim]; - num_partial_pad_dims_++; - } else { - non_padded_dim = slice_shape_dsizes_[dim]; - } - non_padded_slice_shape_.AddDim(non_padded_dim); - } - non_padded_slice_shape_dsizes_ = - non_padded_slice_shape_.AsEigenDSizes(); - } - }; std::vector num_concats_; int num_slices_; std::vector paddings_; bool has_paddings_; + std::optional> concatenator_; }; template -class XlaConcatNDBaseOp : public XlaConcatNDShared { +class XlaConcatNDBaseOp : public XlaConcatNDShared { public: explicit TF_ATTRIBUTE_NOINLINE XlaConcatNDBaseOp(OpKernelConstruction* ctx) - : XlaConcatNDShared(ctx) {} + : XlaConcatNDShared(ctx) {} protected: void ComputeInternal( bool resource, OpKernelContext* ctx, const OpInputList& inputs, const std::function& assign_or_copy_value_fn, const std::function()>& get_output_fn) { - const int rank = inputs[0].shape().dims(); - - OP_REQUIRES(ctx, rank > 0 && rank <= 8, - absl::InvalidArgumentError(absl::StrCat( - "'inputs' tensors must have rank in range (0, 8], but got ", - rank, "."))); - - if (num_slices_ == 1 && !has_paddings_) { - // Simple case - ApplyAssignOrCopyShared(ctx, assign_or_copy_value_fn, inputs[0]); - return; - } - const Device& device = ctx->eigen_device(); - auto status_or_output = get_output_fn(); - OP_REQUIRES_OK(ctx, status_or_output.status()); - Tensor* output = std::move(status_or_output).value(); - - if (rank == 1) { - MaybeUnpadAndAssign<1>(ctx, device, inputs, output); - } else if (rank == 2) { - MaybeUnpadAndAssign<2>(ctx, device, inputs, output); - } else if (rank == 3) { - MaybeUnpadAndAssign<3>(ctx, device, inputs, output); - } else if (rank == 4) { - MaybeUnpadAndAssign<4>(ctx, device, inputs, output); - } else if (rank == 5) { - MaybeUnpadAndAssign<5>(ctx, device, inputs, output); - } else if (rank == 6) { - MaybeUnpadAndAssign<6>(ctx, device, inputs, output); - } else if (rank == 7) { - MaybeUnpadAndAssign<7>(ctx, device, inputs, output); - } else if (rank == 8) { - MaybeUnpadAndAssign<8>(ctx, device, inputs, output); - } - } - - private: - template - void TF_ATTRIBUTE_NOINLINE MaybeUnpadAndAssign(OpKernelContext* ctx, - const Device& device, - const OpInputList& inputs, - Tensor* output) { - for (int i = 0; i < num_slices_; ++i) { - MaybeUnpadAndAssignState r(num_concats_, inputs[0], output, i); - if (r.num_complete_pad_dims_ == Rank) { - continue; - } else if (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0) { - output->tensor() - .slice(r.slice_indices_, r.non_padded_slice_shape_dsizes_) - .device(device) = inputs[i].tensor().slice( - Eigen::DSizes(), - r.non_padded_slice_shape_dsizes_); - } else { - output->tensor() - .slice(r.slice_indices_, r.slice_shape_dsizes_) - .device(device) = inputs[i].tensor(); - } - } + std::vector input_tensors(inputs.begin(), inputs.end()); + auto status = this->concatenator_->ComputeInternal( + absl::MakeSpan(input_tensors), assign_or_copy_value_fn, get_output_fn, + device); + OP_REQUIRES_OK(ctx, status); } }; diff --git a/tensorflow/core/tpu/kernels/sharding_utils.cc b/tensorflow/core/tpu/kernels/sharding_utils.cc new file mode 100644 index 00000000000000..0f4b9620b347f3 --- /dev/null +++ 
b/tensorflow/core/tpu/kernels/sharding_utils.cc @@ -0,0 +1,237 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/tpu/kernels/sharding_utils.h" + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/logging.h" // IWYU pragma: keep +#include "tsl/platform/macros.h" + +namespace tensorflow { +namespace sharding_internal { +absl::Status ValidateShapesForSlice(absl::string_view input_name, + const Tensor* input, + const std::vector& num_splits, + const std::vector& paddings) { + const auto& ishape = input->shape(); + + Status s; + + const int rank = ishape.dims(); + const auto& input_shape = ishape.dim_sizes(); + if (rank <= 0 || rank > 8) { + s = absl::InvalidArgumentError(absl::StrCat( + input_name, " must have rank in range (0, 8], but got ", rank, ".")); + } else if (rank != num_splits.size()) { + s = absl::InvalidArgumentError(absl::StrCat( + input_name, " rank must be the same as 'num_splits' length ", + num_splits.size(), ", but got rank ", rank, ".")); + } else { + for (int dim = 0; dim < rank; ++dim) { + const auto input_shape_dim = input_shape[dim]; + const auto paddings_dim = paddings[dim]; + const auto num_splits_dim = num_splits[dim]; + if ((input_shape_dim + paddings_dim) % num_splits_dim != 0) { + s = absl::InvalidArgumentError(absl::StrCat( + input_name, " shape dimension ", dim, " (", input_shape_dim, + ") with padding ", paddings_dim, + " must be evenly divisible by 'num_splits' ", num_splits_dim, ".")); + break; + } + } + } + return s; +} + +} // namespace sharding_internal + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[0] = index * slice_shape[0]; + return subscript; +} + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[1] = (index % num_partitions[1]) * slice_shape[1]; + subscript[0] = (index / num_partitions[1]) * slice_shape[0]; + return subscript; +} + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[2] = (index % num_partitions[2]) * slice_shape[2]; + subscript[1] = + ((index / num_partitions[2]) % num_partitions[1]) * slice_shape[1]; + subscript[0] = + (index / (num_partitions[2] * num_partitions[1])) * slice_shape[0]; + return subscript; +} 
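For reference, the GetSliceIndices specializations above (and the higher-rank ones that follow) all implement the same row-major decomposition of a flat slice index into per-dimension start offsets. A minimal standalone sketch of that arithmetic, with illustrative names that are not part of the TensorFlow sources:

#include <cstdint>
#include <vector>

// Decompose a flat slice index into per-dimension start offsets
// (each offset is the partition subscript scaled by the slice extent).
std::vector<int64_t> SliceStartIndices(const std::vector<int64_t>& num_partitions,
                                       const std::vector<int64_t>& slice_shape,
                                       int64_t index) {
  std::vector<int64_t> start(num_partitions.size(), 0);
  int64_t stride = 1;  // product of partition counts over trailing dimensions
  for (int dim = static_cast<int>(num_partitions.size()) - 1; dim >= 0; --dim) {
    start[dim] = ((index / stride) % num_partitions[dim]) * slice_shape[dim];
    stride *= num_partitions[dim];
  }
  return start;
}

// Example: num_partitions = {2, 2}, slice_shape = {2, 2}, index = 3
// gives start offsets {2, 2}, matching the rank-2 specialization above.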
+ +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[3] = (index % num_partitions[3]) * slice_shape[3]; + subscript[2] = + ((index / num_partitions[3]) % num_partitions[2]) * slice_shape[2]; + subscript[1] = + ((index / (num_partitions[3] * num_partitions[2])) % num_partitions[1]) * + slice_shape[1]; + subscript[0] = + (index / (num_partitions[3] * num_partitions[2] * num_partitions[1])) * + slice_shape[0]; + return subscript; +} + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[4] = (index % num_partitions[4]) * slice_shape[4]; + subscript[3] = + ((index / num_partitions[4]) % num_partitions[3]) * slice_shape[3]; + subscript[2] = + ((index / (num_partitions[4] * num_partitions[3])) % num_partitions[2]) * + slice_shape[2]; + subscript[1] = + ((index / (num_partitions[4] * num_partitions[3] * num_partitions[2])) % + num_partitions[1]) * + slice_shape[1]; + subscript[0] = (index / (num_partitions[4] * num_partitions[3] * + num_partitions[2] * num_partitions[1])) * + slice_shape[0]; + return subscript; +} + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[5] = (index % num_partitions[5]) * slice_shape[5]; + subscript[4] = + ((index / num_partitions[5]) % num_partitions[4]) * slice_shape[4]; + subscript[3] = + ((index / (num_partitions[5] * num_partitions[4])) % num_partitions[3]) * + slice_shape[3]; + subscript[2] = + ((index / (num_partitions[5] * num_partitions[4] * num_partitions[3])) % + num_partitions[2]) * + slice_shape[2]; + subscript[1] = ((index / (num_partitions[5] * num_partitions[4] * + num_partitions[3] * num_partitions[2])) % + num_partitions[1]) * + slice_shape[1]; + subscript[0] = + (index / (num_partitions[5] * num_partitions[4] * num_partitions[3] * + num_partitions[2] * num_partitions[1])) * + slice_shape[0]; + return subscript; +} + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[6] = (index % num_partitions[6]) * slice_shape[6]; + subscript[5] = + ((index / num_partitions[6]) % num_partitions[5]) * slice_shape[5]; + subscript[4] = + ((index / (num_partitions[6] * num_partitions[5])) % num_partitions[4]) * + slice_shape[4]; + subscript[3] = + ((index / (num_partitions[6] * num_partitions[5] * num_partitions[4])) % + num_partitions[3]) * + slice_shape[3]; + subscript[2] = ((index / (num_partitions[6] * num_partitions[5] * + num_partitions[4] * num_partitions[3])) % + num_partitions[2]) * + slice_shape[2]; + subscript[1] = + ((index / (num_partitions[6] * num_partitions[5] * num_partitions[4] * + num_partitions[3] * num_partitions[2])) % + num_partitions[1]) * + slice_shape[1]; + subscript[0] = + (index / (num_partitions[6] * num_partitions[5] * num_partitions[4] * + num_partitions[3] * num_partitions[2] * num_partitions[1])) * + slice_shape[0]; + return subscript; +} + +template <> +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, const int index) { + Eigen::DSizes subscript; + subscript[7] = (index % num_partitions[7]) * slice_shape[7]; + subscript[6] = + ((index / num_partitions[7]) % num_partitions[6]) * slice_shape[6]; + subscript[5] = + ((index / 
(num_partitions[7] * num_partitions[6])) % num_partitions[5]) * + slice_shape[5]; + subscript[4] = + ((index / (num_partitions[7] * num_partitions[6] * num_partitions[5])) % + num_partitions[4]) * + slice_shape[4]; + subscript[3] = ((index / (num_partitions[7] * num_partitions[6] * + num_partitions[5] * num_partitions[4])) % + num_partitions[3]) * + slice_shape[3]; + subscript[2] = + ((index / (num_partitions[7] * num_partitions[6] * num_partitions[5] * + num_partitions[4] * num_partitions[3])) % + num_partitions[2]) * + slice_shape[2]; + subscript[1] = + ((index / (num_partitions[7] * num_partitions[6] * num_partitions[5] * + num_partitions[4] * num_partitions[3] * num_partitions[2])) % + num_partitions[1]) * + slice_shape[1]; + subscript[0] = + (index / (num_partitions[7] * num_partitions[6] * num_partitions[5] * + num_partitions[4] * num_partitions[3] * num_partitions[2] * + num_partitions[1])) * + slice_shape[0]; + return subscript; +} + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/sharding_utils.h b/tensorflow/core/tpu/kernels/sharding_utils.h new file mode 100644 index 00000000000000..429e327462ad74 --- /dev/null +++ b/tensorflow/core/tpu/kernels/sharding_utils.h @@ -0,0 +1,456 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_KERNELS_SHARDING_UTILS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_SHARDING_UTILS_H_ + +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "Eigen/Core" // from @eigen_archive +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/device.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/macros.h" +#include "tsl/platform/statusor.h" + +namespace tensorflow { +namespace sharding_internal { +absl::Status ValidateShapesForSlice(absl::string_view input_name, + const Tensor* input, + const std::vector& num_splits, + const std::vector& paddings); +template +Eigen::DSizes TF_ATTRIBUTE_NOINLINE +ShapeAsEigenDSizes(const TensorShape& shape); +template +Eigen::DSizes ShapeAsEigenDSizes( + const TensorShape& shape) { + return shape.AsEigenDSizes(); +} + +} // namespace sharding_internal + +// Converts flatten index to start indices (subscript scaled with slice shape) +// for determining where to start a slice in the input tensor. 
+template +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); +template <> +Eigen::DSizes TF_ATTRIBUTE_NOINLINE GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, int index); + +template +Eigen::DSizes GetSliceIndices( + absl::Span num_partitions, + const Eigen::DSizes& slice_shape, + const int index) { + return Eigen::DSizes(); +} + +// Shared base class to save code space +template +class XlaNDSplitter { + public: + static absl::StatusOr> Create( + const std::vector& num_splits, int num_slices, + const std::vector& paddings, bool has_paddings) { + if (num_splits.size() != paddings.size()) { + return absl::InvalidArgumentError( + absl::StrCat("num_splits size ", num_splits.size(), + " mismatch with paddings size ", paddings.size(), ".")); + } + + int splits_cnt = 1; + for (auto split : num_splits) { + splits_cnt *= split; + } + + if (num_slices != splits_cnt) { + return absl::InvalidArgumentError(absl::StrCat( + "Expect num_slices ", splits_cnt, " but got ", num_slices)); + } + + return XlaNDSplitter(num_splits, num_slices, paddings, + has_paddings); + } + + // Split the given input. + // + // The splitted outputs are stored into tensors allocated by + // `allocate_output_fn`. In the simple case of pass through (no split and no + // padding), the output is stored through the fast path by + // `assign_or_copy_value_fn`. + absl::Status Split( + const Tensor* input, absl::string_view input_name, + const std::function& assign_or_copy_value_fn, + const std::function& allocate_output_fn, + const Device& device) { + if (num_splits_.size() != paddings_.size()) { + return absl::InvalidArgumentError( + absl::StrCat("num_splits size ", num_splits_.size(), + " mismatch with paddings size ", paddings_.size(), ".")); + } + + const int rank = input->shape().dims(); + const auto& input_shape = input->shape().dim_sizes(); + + TF_RETURN_IF_ERROR(sharding_internal::ValidateShapesForSlice( + input_name, input, num_splits_, paddings_)); + + TensorShape output_slice_shape; + for (int i = 0; i < rank; ++i) { + output_slice_shape.AddDim((input_shape[i] + paddings_[i]) / + ((num_slices_ == 1) ? 
1 : num_splits_[i])); + } + if (num_slices_ == 1 && !has_paddings_) { + // Handle simple case first + TF_RETURN_IF_ERROR(assign_or_copy_value_fn(*input)); + } else { + std::vector output_slices(num_slices_); + for (int i = 0; i < num_slices_; i++) { + TF_RETURN_IF_ERROR(allocate_output_fn( + /*index=*/i, output_slice_shape, &output_slices[i])); + } + + if (rank == 1) { + SliceAndMaybePad<1>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 2) { + SliceAndMaybePad<2>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 3) { + SliceAndMaybePad<3>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 4) { + SliceAndMaybePad<4>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 5) { + SliceAndMaybePad<5>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 6) { + SliceAndMaybePad<6>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 7) { + SliceAndMaybePad<7>(device, input, input_shape, output_slice_shape, + output_slices); + } else if (rank == 8) { + SliceAndMaybePad<8>(device, input, input_shape, output_slice_shape, + output_slices); + } + } + return absl::OkStatus(); + } + + private: + template + class SliceAndMaybePadState { + public: + int num_complete_pad_dims_; + int num_partial_pad_dims_; + TensorShape non_padded_slice_shape_; + Eigen::array, Rank> slice_paddings_; + Eigen::DSizes slice_indices_; + Eigen::DSizes output_slice_shape_dsizes_; + Eigen::DSizes non_padded_slice_shape_dsizes_; + + TF_ATTRIBUTE_NOINLINE SliceAndMaybePadState( + absl::Span num_splits, + const absl::Span input_shape, + const TensorShape& output_slice_shape, int slice_index) { + output_slice_shape_dsizes_ = + sharding_internal::ShapeAsEigenDSizes(output_slice_shape); + num_complete_pad_dims_ = 0; + num_partial_pad_dims_ = 0; + slice_indices_ = GetSliceIndices( + num_splits, output_slice_shape_dsizes_, slice_index); + + // Calculate paddings necessary for slice instead of padding input and + // slicing subsequently to reduce temporary memory allocation. + for (int dim = 0; dim < Rank; ++dim) { + const int64_t dim_size = input_shape[dim]; + const int64_t out_dim = output_slice_shape_dsizes_[dim]; + int64_t non_padded_dim = 0; + if (slice_indices_[dim] >= dim_size) { + // Complete padding. + slice_indices_[dim] = dim_size; + non_padded_dim = 0; + slice_paddings_[dim] = {0, out_dim}; + num_complete_pad_dims_++; + } else if (slice_indices_[dim] + out_dim > dim_size) { + // Partial padding. 
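+          // The slice starts inside the input but extends past its end along
+          // this dimension, so only the in-bounds prefix (non_padded_dim
+          // elements) is copied and the rest of the output slice is padded.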
+ non_padded_dim = dim_size - slice_indices_[dim]; + slice_paddings_[dim] = {0, out_dim - non_padded_dim}; + num_partial_pad_dims_++; + } else { + non_padded_dim = out_dim; + } + non_padded_slice_shape_.AddDim(non_padded_dim); + } + non_padded_slice_shape_dsizes_ = + sharding_internal::ShapeAsEigenDSizes(non_padded_slice_shape_); + } + }; + + std::vector num_splits_; + int num_slices_; + std::vector paddings_; + bool has_paddings_; + + explicit XlaNDSplitter(const std::vector& num_splits, int num_slices, + const std::vector& paddings, + bool has_paddings) + : num_splits_(num_splits), + num_slices_(num_slices), + paddings_(paddings), + has_paddings_(has_paddings) {} + + void TF_ATTRIBUTE_NOINLINE SetToConstant(Tensor* output_slice, + const Device& device) { + auto output_flat = output_slice->flat(); + output_flat.device(device) = output_flat.constant(T()); + } + + template + void TF_ATTRIBUTE_NOINLINE AssignFromInput( + Tensor* output_slice, const Device& device, const Tensor* input, + const Eigen::DSizes& slice_indices, + const Eigen::DSizes& output_slice_shape_dsizes) { + output_slice->tensor().device(device) = + input->tensor().slice(slice_indices, + output_slice_shape_dsizes); + } + + template + void TF_ATTRIBUTE_NOINLINE + SliceAndMaybePad(const Device& device, const Tensor* input, + const absl::Span input_shape, + const TensorShape& output_slice_shape, + const std::vector& output_slices) { + const auto& input_tensor = input->tensor(); + // Slice shape with optional padding. + for (int i = 0; i < num_slices_; ++i) { + Tensor* output_slice = output_slices[i]; + SliceAndMaybePadState r(num_splits_, input_shape, + output_slice_shape, i); + if (r.num_complete_pad_dims_ == Rank || + (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0)) { + // Need to init padding + SetToConstant(output_slice, device); + } + if (r.num_complete_pad_dims_ == Rank) { + // Done + } else if (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0) { + output_slice->tensor() + .slice(Eigen::DSizes(), + r.non_padded_slice_shape_dsizes_) + .device(device) = input_tensor.slice( + r.slice_indices_, r.non_padded_slice_shape_dsizes_); + } else { + AssignFromInput(output_slice, device, input, r.slice_indices_, + r.output_slice_shape_dsizes_); + } + } + } +}; + +// Shared base class to save code space +template +class XlaNDConcatenator { + public: + static absl::StatusOr> Create( + const std::vector& num_concats, int num_slices, + const std::vector& paddings, bool has_paddings) { + if (num_concats.size() != paddings.size()) { + return absl::InvalidArgumentError( + absl::StrCat("num_concats size ", num_concats.size(), + " mismatch with paddings size ", paddings.size(), ".")); + } + + int concats_cnt = 1; + for (auto concat : num_concats) { + concats_cnt *= concat; + } + + if (num_slices != concats_cnt) { + return absl::InvalidArgumentError(absl::StrCat( + "Expect num_slices ", concats_cnt, " but got ", num_slices)); + } + + return XlaNDConcatenator(num_concats, num_slices, paddings, + has_paddings); + } + absl::Status ComputeInternal( + absl::Span inputs, + const std::function& assign_or_copy_value_fn, + const std::function()>& get_output_fn, + const Device& device) { + const int rank = inputs[0].shape().dims(); + + if (rank < 1 || rank > 8) { + return absl::InvalidArgumentError(absl::StrCat( + "'inputs' tensors must have rank in range (0, 8], but got ", rank, + ".")); + } + + if (num_slices_ == 1 && !has_paddings_) { + // Simple case + return assign_or_copy_value_fn(inputs[0]); + } + + 
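+    // General case: materialize the full output tensor once, then copy each
+    // input slice into place at its start offsets, stripping padding as needed.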
TF_ASSIGN_OR_RETURN(Tensor * output, get_output_fn()); + + if (rank == 1) { + MaybeUnpadAndAssign<1>(device, inputs, output); + } else if (rank == 2) { + MaybeUnpadAndAssign<2>(device, inputs, output); + } else if (rank == 3) { + MaybeUnpadAndAssign<3>(device, inputs, output); + } else if (rank == 4) { + MaybeUnpadAndAssign<4>(device, inputs, output); + } else if (rank == 5) { + MaybeUnpadAndAssign<5>(device, inputs, output); + } else if (rank == 6) { + MaybeUnpadAndAssign<6>(device, inputs, output); + } else if (rank == 7) { + MaybeUnpadAndAssign<7>(device, inputs, output); + } else if (rank == 8) { + MaybeUnpadAndAssign<8>(device, inputs, output); + } + return absl::OkStatus(); + } + + private: + template + class MaybeUnpadAndAssignState { + public: + int num_complete_pad_dims_; + int num_partial_pad_dims_; + TensorShape non_padded_slice_shape_; + Eigen::DSizes slice_shape_dsizes_; + Eigen::array, Rank> slice_paddings_; + Eigen::DSizes slice_indices_; + Eigen::DSizes output_slice_shape_dsizes_; + Eigen::DSizes non_padded_slice_shape_dsizes_; + + TF_ATTRIBUTE_NOINLINE MaybeUnpadAndAssignState( + absl::Span num_concats, const Tensor& input0, + Tensor* output, int slice_index) { + slice_shape_dsizes_ = input0.shape().AsEigenDSizes(); + slice_indices_ = + GetSliceIndices(num_concats, slice_shape_dsizes_, slice_index); + num_complete_pad_dims_ = 0; + num_partial_pad_dims_ = 0; + // Calculate paddings necessary to strip from slice. + for (int dim = 0; dim < Rank; ++dim) { + const int64_t dim_size = output->shape().dim_size(dim); + int64_t non_padded_dim = 0; + if (slice_indices_[dim] >= dim_size) { + // Complete padding. + slice_indices_[dim] = dim_size; + non_padded_dim = 0; + num_complete_pad_dims_++; + } else if (slice_indices_[dim] + slice_shape_dsizes_[dim] > dim_size) { + // Partial padding. + non_padded_dim = dim_size - slice_indices_[dim]; + num_partial_pad_dims_++; + } else { + non_padded_dim = slice_shape_dsizes_[dim]; + } + non_padded_slice_shape_.AddDim(non_padded_dim); + } + non_padded_slice_shape_dsizes_ = + non_padded_slice_shape_.AsEigenDSizes(); + } + }; + + std::vector num_concats_; + int num_slices_; + std::vector paddings_; + bool has_paddings_; + + explicit TF_ATTRIBUTE_NOINLINE XlaNDConcatenator( + const std::vector& num_concats, int num_slices, + const std::vector& paddings, bool has_paddings) + : num_concats_(num_concats), + num_slices_(num_slices), + paddings_(paddings), + has_paddings_(has_paddings) {} + + template + void TF_ATTRIBUTE_NOINLINE MaybeUnpadAndAssign(const Device& device, + absl::Span inputs, + Tensor* output) { + for (int i = 0; i < num_slices_; ++i) { + MaybeUnpadAndAssignState r(num_concats_, inputs[0], output, i); + if (r.num_complete_pad_dims_ == Rank) { + continue; + } else if (r.num_complete_pad_dims_ > 0 || r.num_partial_pad_dims_ > 0) { + output->tensor() + .slice(r.slice_indices_, r.non_padded_slice_shape_dsizes_) + .device(device) = inputs[i].tensor().slice( + Eigen::DSizes(), + r.non_padded_slice_shape_dsizes_); + } else { + output->tensor() + .slice(r.slice_indices_, r.slice_shape_dsizes_) + .device(device) = inputs[i].tensor(); + } + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_SHARDING_UTILS_H_ diff --git a/tensorflow/core/tpu/kernels/sharding_utils_test.cc b/tensorflow/core/tpu/kernels/sharding_utils_test.cc new file mode 100644 index 00000000000000..cd583df8a57bef --- /dev/null +++ b/tensorflow/core/tpu/kernels/sharding_utils_test.cc @@ -0,0 +1,456 @@ +/* Copyright 2023 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/tpu/kernels/sharding_utils.h" + +#include +#include +#include + +#include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/env.h" +#include "tsl/platform/statusor.h" +#include "tsl/platform/test.h" +#include "tsl/platform/threadpool.h" + +namespace tensorflow { +namespace { +Eigen::ThreadPoolDevice CreateThreadPoolDevice() { + constexpr int kMaxParallelism = 16; + auto thread_pool = std::make_unique( + tsl::Env::Default(), tsl::ThreadOptions(), "Resharding", kMaxParallelism); + + Eigen::ThreadPoolDevice device(thread_pool->AsEigenThreadPool(), + kMaxParallelism); + return device; +} + +TEST(XlaNDSplitterTest, NoSplits) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 2, 2}); + const std::vector num_splits = {1, 1, 1}; + const std::vector paddings(num_splits.size(), 0); + const int num_outputs = 1; + auto input_tensor = + test::AsTensor({0, 1, 2, 3, 4, 5, 6, 7}, input_shape); + + std::vector output_tensors; + output_tensors.resize(num_outputs); + auto allocate_output_fn = [&](int i, const TensorShape& output_slice_shape, + Tensor** tensor) { + if (i < 0 || i >= output_tensors.size()) { + return absl::InvalidArgumentError(absl::StrCat( + "Index ", i, " out of range [0, ", output_tensors.size(), "]")); + } + output_tensors[i] = Tensor(tensorflow::DT_INT32, output_slice_shape); + *tensor = &output_tensors[i]; + return absl::OkStatus(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors[0] = input; + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto splitter, (XlaNDSplitter::Create( + num_splits, num_outputs, paddings, + /*has_paddings=*/false))); + TF_ASSERT_OK(splitter.Split(&input_tensor, "test", assign_or_copy_value_fn, + allocate_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), 1); + test::ExpectTensorEqual( + output_tensors[0], test::AsTensor({0, 1, 2, 3, 4, 5, 6, 7}, + TensorShape({2, 2, 2}))); +} + +TEST(XlaNDSplitterTest, NoSplitsWithPadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 1, 1}); + const std::vector num_splits = {1, 1, 1}; + const std::vector paddings = {0, 1, 1}; + const int num_outputs = 1; + auto input_tensor = test::AsTensor({0, 1}, input_shape); + + std::vector output_tensors; + output_tensors.resize(num_outputs); + auto allocate_output_fn = [&](int i, const TensorShape& output_slice_shape, + Tensor** tensor) { + if (i < 0 || i >= output_tensors.size()) { + return 
absl::InvalidArgumentError(absl::StrCat( + "Index ", i, " out of range [0, ", output_tensors.size(), "]")); + } + output_tensors[i] = Tensor(tensorflow::DT_INT32, output_slice_shape); + *tensor = &output_tensors[i]; + return absl::OkStatus(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors[0] = input; + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto splitter, (XlaNDSplitter::Create( + num_splits, num_outputs, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(splitter.Split(&input_tensor, "test", assign_or_copy_value_fn, + allocate_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), 1); + std::vector expected_values(3 * 3 * 3); + test::ExpectTensorEqual( + output_tensors[0], test::AsTensor({0, 0, 0, 0, 1, 0, 0, 0}, + TensorShape({2, 2, 2}))); +} + +TEST(XlaNDSplitterTest, SplitNoPadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({4, 4}); + const std::vector num_splits = {2, 2}; + const std::vector paddings(num_splits.size(), 0); + const int num_outputs = 4; + auto input_tensor = test::AsTensor( + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, input_shape); + + std::vector output_tensors; + output_tensors.resize(num_outputs); + auto allocate_output_fn = [&](int i, const TensorShape& output_slice_shape, + Tensor** tensor) { + if (i < 0 || i >= output_tensors.size()) { + return absl::InvalidArgumentError(absl::StrCat( + "Index ", i, " out of range [0, ", output_tensors.size(), "]")); + } + output_tensors[i] = Tensor(tensorflow::DT_INT32, output_slice_shape); + *tensor = &output_tensors[i]; + return absl::OkStatus(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors[0] = input; + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto splitter, (XlaNDSplitter::Create( + num_splits, num_outputs, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(splitter.Split(&input_tensor, "test", assign_or_copy_value_fn, + allocate_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), num_outputs); + test::ExpectTensorEqual( + output_tensors[0], + test::AsTensor({0, 1, 4, 5}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[1], + test::AsTensor({2, 3, 6, 7}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[2], + test::AsTensor({8, 9, 12, 13}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[3], + test::AsTensor({10, 11, 14, 15}, TensorShape({2, 2}))); +} + +TEST(XlaNDSplitterTest, SplitPartialPadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({3, 3}); + const std::vector num_splits = {2, 2}; + const std::vector paddings = {1, 1}; + const int num_outputs = 4; + auto input_tensor = + test::AsTensor({0, 1, 2, 3, 4, 5, 6, 7, 8}, input_shape); + + std::vector output_tensors; + output_tensors.resize(num_outputs); + auto allocate_output_fn = [&](int i, const TensorShape& output_slice_shape, + Tensor** tensor) { + if (i < 0 || i >= output_tensors.size()) { + return absl::InvalidArgumentError(absl::StrCat( + "Index ", i, " out of range [0, ", output_tensors.size(), "]")); + } + output_tensors[i] = Tensor(tensorflow::DT_INT32, output_slice_shape); + *tensor = &output_tensors[i]; + return absl::OkStatus(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors[0] = input; + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto splitter, (XlaNDSplitter::Create( + num_splits, num_outputs, paddings, + 
/*has_paddings=*/true))); + + TF_ASSERT_OK(splitter.Split(&input_tensor, "test", assign_or_copy_value_fn, + allocate_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), num_outputs); + test::ExpectTensorEqual( + output_tensors[0], + test::AsTensor({0, 1, 3, 4}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[1], + test::AsTensor({2, 0, 5, 0}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[2], + test::AsTensor({6, 7, 0, 0}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[3], + test::AsTensor({8, 0, 0, 0}, TensorShape({2, 2}))); +} + +TEST(XlaNDSplitterTest, SplitCompletePadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 1}); + const std::vector num_splits = {2, 2}; + const std::vector paddings = {2, 3}; + const int num_outputs = 4; + auto input_tensor = test::AsTensor({0, 1}, input_shape); + + std::vector output_tensors; + output_tensors.resize(num_outputs); + auto allocate_output_fn = [&](int i, const TensorShape& output_slice_shape, + Tensor** tensor) { + if (i < 0 || i >= output_tensors.size()) { + return absl::InvalidArgumentError(absl::StrCat( + "Index ", i, " out of range [0, ", output_tensors.size(), "]")); + } + output_tensors[i] = Tensor(tensorflow::DT_INT32, output_slice_shape); + *tensor = &output_tensors[i]; + return absl::OkStatus(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors[0] = input; + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto splitter, (XlaNDSplitter::Create( + num_splits, num_outputs, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(splitter.Split(&input_tensor, "test", assign_or_copy_value_fn, + allocate_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), num_outputs); + test::ExpectTensorEqual( + output_tensors[0], + test::AsTensor({0, 0, 1, 0}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[1], + test::AsTensor({0, 0, 0, 0}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[2], + test::AsTensor({0, 0, 0, 0}, TensorShape({2, 2}))); + test::ExpectTensorEqual( + output_tensors[3], + test::AsTensor({0, 0, 0, 0}, TensorShape({2, 2}))); +} + +TEST(XlaNDConcatenatorTest, NoConcats) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 2, 2}); + const TensorShape output_shape({2, 2, 2}); + const std::vector num_concats = {1, 1, 1}; + const std::vector paddings(num_concats.size(), 0); + int num_slices = 1; + auto tensor0 = test::AsTensor({0, 1, 2, 3, 4, 5, 6, 7}, input_shape); + std::vector input_tensors; + input_tensors.push_back(tensor0); + + std::vector output_tensors; + output_tensors.reserve(1); + auto get_output_fn = [&]() { + output_tensors.push_back(Tensor(tensorflow::DT_INT32, output_shape)); + return &output_tensors.back(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors.push_back(input); + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto concatenator, + (XlaNDConcatenator::Create( + num_concats, num_slices, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(concatenator.ComputeInternal(absl::MakeSpan(input_tensors), + assign_or_copy_value_fn, + get_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), 1); + test::ExpectTensorEqual( + output_tensors[0], test::AsTensor({0, 1, 2, 3, 4, 5, 6, 7}, + TensorShape({2, 2, 2}))); +} + +TEST(XlaNDConcatenatorTest, ConcatNoPadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 
2}); + const TensorShape output_shape({4, 4}); + const std::vector num_concats = {2, 2}; + const std::vector paddings(num_concats.size(), 0); + int num_slices = 4; + auto tensor0 = test::AsTensor({0, 1, 2, 3}, input_shape); + auto tensor1 = test::AsTensor({4, 5, 6, 7}, input_shape); + auto tensor2 = test::AsTensor({8, 9, 10, 11}, input_shape); + auto tensor3 = test::AsTensor({12, 13, 14, 15}, input_shape); + std::vector input_tensors; + input_tensors.push_back(tensor0); + input_tensors.push_back(tensor1); + input_tensors.push_back(tensor2); + input_tensors.push_back(tensor3); + + std::vector output_tensors; + output_tensors.reserve(1); + auto get_output_fn = [&]() { + output_tensors.push_back(Tensor(tensorflow::DT_INT32, output_shape)); + return &output_tensors.back(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors.push_back(input); + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto concatenator, + (XlaNDConcatenator::Create( + num_concats, num_slices, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(concatenator.ComputeInternal(absl::MakeSpan(input_tensors), + assign_or_copy_value_fn, + get_output_fn, device)); + ASSERT_EQ(output_tensors.size(), 1); + test::ExpectTensorEqual( + output_tensors[0], test::AsTensor({0, 1, 4, 5, 2, 3, 6, 7, 8, 9, + 12, 13, 10, 11, 14, 15}, + TensorShape({4, 4}))); +} + +TEST(XlaNDConcatenatorTest, ConcatPartialPadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 2}); + const TensorShape output_shape({3, 3}); + const std::vector num_concats = {2, 2}; + const std::vector paddings = {1, 1}; + int num_slices = 4; + auto tensor0 = test::AsTensor({0, 1, 2, 3}, input_shape); + auto tensor1 = test::AsTensor({4, 5, 6, 7}, input_shape); + auto tensor2 = test::AsTensor({8, 9, 10, 11}, input_shape); + auto tensor3 = test::AsTensor({12, 13, 14, 15}, input_shape); + std::vector input_tensors; + input_tensors.push_back(tensor0); + input_tensors.push_back(tensor1); + input_tensors.push_back(tensor2); + input_tensors.push_back(tensor3); + + std::vector output_tensors; + output_tensors.reserve(1); + auto get_output_fn = [&]() { + output_tensors.push_back(Tensor(tensorflow::DT_INT32, output_shape)); + return &output_tensors.back(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors.push_back(input); + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto concatenator, + (XlaNDConcatenator::Create( + num_concats, num_slices, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(concatenator.ComputeInternal(absl::MakeSpan(input_tensors), + assign_or_copy_value_fn, + get_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), 1); + test::ExpectTensorEqual( + output_tensors[0], test::AsTensor({0, 1, 4, 2, 3, 6, 8, 9, 12}, + TensorShape({3, 3}))); +} + +TEST(XlaNDConcatenatorTest, ConcatCompletePadding) { + auto device = CreateThreadPoolDevice(); + + const TensorShape input_shape({2, 2}); + const TensorShape output_shape({2, 2}); + const std::vector num_concats = {2, 2}; + const std::vector paddings = {2, 2}; + int num_slices = 4; + auto tensor0 = test::AsTensor({0, 1, 2, 3}, input_shape); + auto tensor1 = test::AsTensor({4, 5, 6, 7}, input_shape); + auto tensor2 = test::AsTensor({8, 9, 10, 11}, input_shape); + auto tensor3 = test::AsTensor({12, 13, 14, 15}, input_shape); + std::vector input_tensors; + input_tensors.push_back(tensor0); + input_tensors.push_back(tensor1); + input_tensors.push_back(tensor2); + 
input_tensors.push_back(tensor3); + + std::vector output_tensors; + output_tensors.reserve(1); + auto get_output_fn = [&]() { + output_tensors.push_back(Tensor(tensorflow::DT_INT32, output_shape)); + return &output_tensors.back(); + }; + auto assign_or_copy_value_fn = [&](const Tensor& input) -> Status { + output_tensors.push_back(input); + return absl::OkStatus(); + }; + + TF_ASSERT_OK_AND_ASSIGN( + auto concatenator, + (XlaNDConcatenator::Create( + num_concats, num_slices, paddings, + /*has_paddings=*/true))); + + TF_ASSERT_OK(concatenator.ComputeInternal(absl::MakeSpan(input_tensors), + assign_or_copy_value_fn, + get_output_fn, device)); + + ASSERT_EQ(output_tensors.size(), 1); + test::ExpectTensorEqual( + output_tensors[0], + test::AsTensor({0, 1, 2, 3}, TensorShape({2, 2}))); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc index 54feed5d5fffe2..53a0c70779534d 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc +++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc @@ -357,6 +357,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { const int64* splits_tensor_ptr = splits->flat().data(); const int32* id_counts_tensor_ptr = id_counts->flat().data(); + const int32_t total_id_count = row_ids->NumElements(); + const int num_physical_replica = num_replica_ * num_sc_per_chip_; size_t xla_pad_size = stream_executor::tpu::OpsApiFn() @@ -405,6 +407,12 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { const int32 max_ids_per_chip = max_ids_per_chip_per_sample_ * sample_count_; + OP_REQUIRES( + ctx, max_ids_per_chip % xla_pad_size == 0, + absl::InvalidArgumentError(absl::StrCat( + "The max_ids_per_chip is set to be ", max_ids_per_chip, + " which is not divisible by the xla_pad_size ", xla_pad_size, " ."))); + const int32 padded_row_pointers_size_per_sc = xla::RoundUpTo(num_physical_replica, xla_pad_size); @@ -435,6 +443,11 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { sorted_token_ids_tensor->flat().data(); float* sorted_gains_tensor_ptr = sorted_gains_tensor->flat().data(); + // This packed id count is used to track how many ids we have packed into + // the output tensor and based on this we would know how many ids that we + // dropped. 
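+  // Once the packing loop finishes, total_id_count - packed_id_count is the
+  // number of ids that could not be packed (i.e. were dropped); that value is
+  // what the warning emitted after the loop reports.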
+ int32_t packed_id_count = 0; + int32 global_index = 0; int32 row_pointers_index = 0; for (int sc_id = 0; sc_id < num_sc_per_chip_; ++sc_id) { @@ -453,14 +466,41 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { const int token_id_start_pos = *(id_counts_tensor_ptr + start_division_pos); - std::copy_n(col_ids_tensor_ptr + token_id_start_pos, token_id_count, - sorted_token_ids_tensor_ptr + global_index); - std::copy_n(row_ids_tensor_ptr + token_id_start_pos, token_id_count, - sorted_sample_ids_tensor_ptr + global_index); - std::copy_n(gains_tensor_ptr + token_id_start_pos, token_id_count, - sorted_gains_tensor_ptr + global_index); - - global_index += token_id_count; + if (global_index + token_id_count > max_ids_per_chip) { + if (allow_id_dropping_for_minibatching_) { + const int32_t copy_id_count = + std::min(max_ids_per_chip - global_index, token_id_count); + std::copy_n(col_ids_tensor_ptr + token_id_start_pos, copy_id_count, + sorted_token_ids_tensor_ptr + global_index); + std::copy_n(row_ids_tensor_ptr + token_id_start_pos, copy_id_count, + sorted_sample_ids_tensor_ptr + global_index); + std::copy_n(gains_tensor_ptr + token_id_start_pos, copy_id_count, + sorted_gains_tensor_ptr + global_index); + packed_id_count += copy_id_count; + global_index = max_ids_per_chip; + } else { + const int32_t remain_id_count = total_id_count - packed_id_count; + ctx->CtxFailure(absl::InvalidArgumentError(absl::StrCat( + "The max_ids_per_chip is set to be ", max_ids_per_chip, + " which is not going to fit all ids. The remaining id count " + "is ", + remain_id_count, + " . Please consider setting the " + "sparse_core_allow_id_dropping_for_minibatching to be " + "true. "))); + return; + } + } else { + std::copy_n(col_ids_tensor_ptr + token_id_start_pos, token_id_count, + sorted_token_ids_tensor_ptr + global_index); + std::copy_n(row_ids_tensor_ptr + token_id_start_pos, token_id_count, + sorted_sample_ids_tensor_ptr + global_index); + std::copy_n(gains_tensor_ptr + token_id_start_pos, token_id_count, + sorted_gains_tensor_ptr + global_index); + + global_index += token_id_count; + packed_id_count += token_id_count; + } *(row_pointers_tensor_ptr + row_pointers_index) = global_index; int32 num_ids_to_pad_per_replica = @@ -484,13 +524,16 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { } } - int32 ids_unpadded_size = global_index; + int32_t ids_unpadded_size = global_index; - OP_REQUIRES(ctx, ids_unpadded_size <= max_ids_per_chip, - absl::InvalidArgumentError(absl::StrCat( - "Got ", ids_unpadded_size, - " ids after padding but the max_ids_per_chip is set to be ", - max_ids_per_chip, " which is smaller."))); + if (packed_id_count < total_id_count) { + const int32_t dropped_id_count = total_id_count - packed_id_count; + LOG(WARNING) << "Dropping " << dropped_id_count + << " ids so that the produced CsrWrappedCooTensor can be fit " + "in static bound of " + << max_ids_per_chip + << " . 
This could potentially impact the model quality."; + } int32 row_pointers_unpadded_size = total_num_minibatch * padded_row_pointers_size_per_sc; @@ -923,7 +966,8 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) { table_name_); CalculateHeadroom(this_max_ids, this_max_uniques, program_key, - max_ids_per_partition, max_unique_ids_per_partition); + max_ids_per_partition, max_unique_ids_per_partition, + dropped_id_count); Tensor* splits_tensor; OP_REQUIRES_OK( diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h index b61367d2cb0796..f2d35b3fa76cd6 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h +++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h @@ -73,6 +73,8 @@ class GetMinibatchesInCsrWithPhysicalReplicaOp : public OpKernel { std::string table_name_; std::unique_ptr sparse_core_ops_stats_handler_; + bool allow_id_dropping_for_minibatching_ = false; + private: int num_replica_ = 1; int max_minibatches_per_sc_ = 1; @@ -96,7 +98,8 @@ class GetMinibatchSplitsWithPhysicalReplicaOp : public OpKernel { virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques, tstring program_key, int64_t max_ids_per_partition, - int64_t max_unique_ids_per_partition) {} + int64_t max_unique_ids_per_partition, + int32_t dropped_id_count) {} virtual inline int32_t CalculateBucketIdWithHashing(int32_t col_id, int32_t num_buckets) { // TODO(pineapplejuice233): Add a proper hashing function here. diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc index 81d56802f1c77b..3d5e8642b2e58e 100644 --- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc +++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc @@ -33,7 +33,6 @@ limitations under the License. #include "xla/shape.h" #include "xla/shape_util.h" #include "xla/stream_executor/tpu/c_api_decl.h" -#include "xla/stream_executor/tpu/status_helper.h" #include "xla/stream_executor/tpu/tpu_api.h" #include "xla/stream_executor/tpu/tpu_ops_c_api.h" #include "xla/xla_data.pb.h" @@ -41,12 +40,21 @@ limitations under the License. #include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/tpu/kernels/sparse_core_ops_utils.h" #include "tsl/platform/macros.h" +typedef tensorflow::monitoring::Gauge TFGaugeMetric; +static TFGaugeMetric* max_ids_per_partition_gauge_ = TFGaugeMetric::New( + "/tensorflow/tpu/embedding/maximum_ids_per_partition", + "Max ids_per_partition limit for each table", "device", "table"); +static TFGaugeMetric* max_unique_ids_per_partition_gauge_ = TFGaugeMetric::New( + "/tensorflow/tpu/embedding/maximum_unique_ids_per_partition", + "Max unique_ids_per_partition limit for each table", "device", "table"); + namespace tensorflow { namespace { @@ -216,6 +224,7 @@ class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel { quantization_config_high_ = quant_clipping_float; } } + device_name_ = ctx->device()->name(); // Check for incomplete quantization config. 
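The new id-dropping path in GetMinibatchesInCsrWithPhysicalReplicaOp above boils down to this: ids are appended bucket by bucket into a buffer with a hard capacity of max_ids_per_chip; once the buffer would overflow, the op either truncates (when allow_id_dropping_for_minibatching_ is set, logging how many ids were lost) or fails with the InvalidArgumentError shown. A schematic Python version of that control flow, assuming buckets is a list of per-(sparse core, replica) id lists; the names are illustrative, not the kernel's:

def pack_ids(buckets, max_ids_per_chip, allow_id_dropping):
    # Pack per-bucket ids into a buffer of at most max_ids_per_chip entries.
    packed = []
    dropped = 0
    for ids in buckets:
        space_left = max_ids_per_chip - len(packed)
        if len(ids) > space_left:
            if not allow_id_dropping:
                remaining = sum(len(b) for b in buckets) - len(packed)
                raise ValueError(
                    f"max_ids_per_chip={max_ids_per_chip} cannot fit all ids; "
                    f"{remaining} ids remain")
            packed.extend(ids[:space_left])  # keep what fits, drop the rest
            dropped += len(ids) - space_left
        else:
            packed.extend(ids)
    if dropped:
        print(f"Dropped {dropped} ids to fit the static bound of {max_ids_per_chip}")
    return packed, dropped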
OP_REQUIRES(ctx, quantization_config_low_.has_value() == @@ -248,10 +257,17 @@ class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel { ctx, GetMaxIdsAndUniquesExternal( "", table_name_, per_sparse_core_batch_size, feature_width, &max_ids_per_partition, &max_unique_ids_per_partition)); - VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: " - << "table_name = '" << table_name_ - << "', max_ids = " << max_ids_per_partition - << ", max_uniques = " << max_unique_ids_per_partition; + // Log max_ids and max_uniques for offline analysis. We do this here since + // these values are fixed at TPU compile time and remain fixed during + // training. + max_ids_per_partition_gauge_->GetCell(device_name_, table_name_) + ->Set(max_ids_per_partition); + max_unique_ids_per_partition_gauge_->GetCell(device_name_, table_name_) + ->Set(max_unique_ids_per_partition); + LOG(INFO) << "Lowering XlaSparseDenseMatmulWithCsrInputOp to HLO: " + << "table_name = '" << table_name_ + << "', max_ids = " << max_ids_per_partition + << ", max_uniques = " << max_unique_ids_per_partition; OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape( "num_minibatches_per_physical_sparse_core")), @@ -321,6 +337,7 @@ class XlaSparseDenseMatmulWithCsrInputOp : public XlaOpKernel { std::optional quantization_config_low_; std::optional quantization_config_high_; std::optional quantization_config_num_buckets_; + std::string device_name_; std::string table_name_; XlaSparseDenseMatmulWithCsrInputOp( @@ -410,10 +427,10 @@ class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel { ctx, GetMaxIdsAndUniquesExternal( "", table_name_, per_sparse_core_batch_size, feature_width, &max_ids_per_partition, &max_unique_ids_per_partition)); - VLOG(3) << "XlaSparseDenseMatmulWithCsrInputOp: " - << "table_name = '" << table_name_ - << "', max_ids = " << max_ids_per_partition - << ", max_uniques = " << max_unique_ids_per_partition; + LOG(INFO) << "Lowering XlaSparseDenseMatmulGradWithCsrInputOp to HLO: " + << "table_name = '" << table_name_ + << "', max_ids = " << max_ids_per_partition + << ", max_uniques = " << max_unique_ids_per_partition; xla::XlaComputation optimizer = build_optimizer_computation(feature_width); diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_support.h b/tensorflow/core/tpu/kernels/tpu_compile_op_support.h index d098abe6e1ae08..5cb7e5a5d55511 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_support.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_support.h @@ -62,7 +62,7 @@ using GuaranteedConsts = std::variant, // List of parameters for lowering function library definition to HLO IR. struct FunctionToHloArgs { const NameAttrList* const function; - const FunctionLibraryDefinition* const flib_def; + const FunctionLibraryDefinition* flib_def; int graph_def_version; GuaranteedConsts guaranteed_constants; }; diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc b/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc index 52b83ae6be78b2..97fe019201e4cb 100644 --- a/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc +++ b/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc @@ -25,7 +25,9 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "xla/client/xla_builder.h" +#include "xla/layout_util.h" #include "xla/literal_util.h" +#include "xla/shape.h" #include "xla/shape_util.h" #include "xla/stream_executor/tpu/c_api_conversions.h" #include "xla/stream_executor/tpu/c_api_decl.h" @@ -252,7 +254,11 @@ class SendTPUEmbeddingGradientsOp : public XlaOpKernel { auto builder = ctx->builder(); gradient_shapes.reserve(gradients.size()); for (xla::XlaOp op : gradients) { - gradient_shapes.push_back(builder->GetShape(op).value()); + // Gradient layout information is added by XLA, so we can just create + // default layout information. + xla::Shape gradient_shape = builder->GetShape(op).value(); + xla::LayoutUtil::SetToDefaultLayout(&gradient_shape); + gradient_shapes.push_back(gradient_shape); } std::vector learning_rates; diff --git a/tensorflow/core/tpu/kernels/tpu_program_group.cc b/tensorflow/core/tpu/kernels/tpu_program_group.cc index 0b5d4444ef0c33..77f7f3361083ea 100644 --- a/tensorflow/core/tpu/kernels/tpu_program_group.cc +++ b/tensorflow/core/tpu/kernels/tpu_program_group.cc @@ -301,42 +301,6 @@ Status TpuProgramGroup::CompileAndBuild( return status.status(); } -/*static*/ -Status TpuProgramGroup::CompileAndBuild( - const xrt::XLAComputation& xrt_computation_proto, - const XLA_TpuMeshState* mesh_state, - TpuProgramGroupInterface* tpu_program_group_interface) { - se_tpu::SerializedProto serialized_compilation_request = - se_tpu::SerializeProto(xrt_computation_proto); - auto cleanup = gtl::MakeCleanup([serialized_compilation_request] { - se_tpu::SerializedProto_Free(serialized_compilation_request); - }); - size_t count = 0; - XLA_TpuProgram** xla_tpu_programs = nullptr; - StatusHelper status; - stream_executor::tpu::OpsApiFn()->TpuCompile_XrtCompileAndBuildFn( - serialized_compilation_request, mesh_state, &xla_tpu_programs, &count, - status.c_status); - if (!status.ok()) { - VLOG(1) << "Run CompileAndBuild failed."; - return status.status(); - } - - // SPMD could return 1 result for all partitions. - int num_cores_per_replica = - xrt_computation_proto.config().num_cores_per_replica() - ? xrt_computation_proto.config().num_cores_per_replica() - : 1; - TF_RET_CHECK(count == 1 || count == num_cores_per_replica); - VLOG(1) << "Initialize TpuProgramGroup."; - TpuProgramGroup* tpu_program_group = - tensorflow::down_cast(tpu_program_group_interface); - tpu_program_group->Initialize( - absl::MakeConstSpan(&xla_tpu_programs[0], count)); - stream_executor::tpu::OpsApiFn()->TpuProgram_FreeArrayFn(xla_tpu_programs); - return status.status(); -} - std::vector TpuProgramGroup::tpu_programs( TpuProgramShardingType sharding_type) const { std::vector tpu_programs; diff --git a/tensorflow/core/tpu/kernels/tpu_program_group.h b/tensorflow/core/tpu/kernels/tpu_program_group.h index 3d164c09725666..6859b0facd038c 100644 --- a/tensorflow/core/tpu/kernels/tpu_program_group.h +++ b/tensorflow/core/tpu/kernels/tpu_program_group.h @@ -27,7 +27,6 @@ limitations under the License. 
#include "xla/service/hlo.pb.h" #include "xla/stream_executor/tpu/tpu_ops_c_api.h" #include "xla/stream_executor/tpu/tpu_platform_interface.h" -#include "tensorflow/compiler/xrt/xrt.pb.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/kernels/tpu_executable_info.pb.h" @@ -96,11 +95,6 @@ class TpuProgramGroup : public TpuProgramGroupInterface { const XLA_TpuMeshState* mesh_state, TpuProgramGroupInterface* tpu_program_group_interface); - // Compiles HLO IR and returns TPU programs ready for execution. - static Status CompileAndBuild( - const xrt::XLAComputation& xrt_computation_proto, - const XLA_TpuMeshState* mesh_state, - TpuProgramGroupInterface* tpu_program_group_interface); // Initializes `TpuProgramGroup` object with `xla_tpu_programs`. void Initialize(absl::Span xla_tpu_programs); diff --git a/tensorflow/core/tpu/ops/sparse_core_ops.cc b/tensorflow/core/tpu/ops/sparse_core_ops.cc index f9b9d64339e572..e770c1814399a2 100644 --- a/tensorflow/core/tpu/ops/sparse_core_ops.cc +++ b/tensorflow/core/tpu/ops/sparse_core_ops.cc @@ -322,4 +322,12 @@ REGISTER_OP("XlaSparseCoreFtrl") return OkStatus(); }); +REGISTER_OP("GlobalIterId") + .Output("iter_id: int64") + .SetIsStateful() + .SetShapeFn([](shape_inference::InferenceContext* c) -> Status { + c->set_output(0, c->Scalar()); + return OkStatus(); + }); + } // namespace tensorflow diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD index e01ba6ff6a792e..d435ac73f29780 100644 --- a/tensorflow/core/util/BUILD +++ b/tensorflow/core/util/BUILD @@ -37,7 +37,6 @@ package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = default_package_visibility, features = [ - "-layering_check", "-parse_headers", ], licenses = ["notice"], @@ -453,6 +452,7 @@ tf_mkl_kernel_library( "//tensorflow/core:lib_internal", "//tensorflow/core/framework:bounds_check", "//tensorflow/core/kernels:ops_util", + "@com_google_absl//absl/base", ], ) @@ -726,6 +726,7 @@ tf_kernel_library( srcs = ["cuda_solvers.cc"], hdrs = ["gpu_solvers.h"], compatible_with = [], + features = ["-layering_check"], # @local_config_cuda//cuda:cusolver_static, //third_party/eigen3:blas, # and //third_party/libf2c all contain various parts of BLAS, LAPACK, # and f2c helper functions in global namespace. Tell the compiler to @@ -773,6 +774,7 @@ tf_kernel_library( "gpu_solvers.h", ], compatible_with = [], + features = ["-layering_check"], deps = [ ":cuda_solvers", "//tensorflow/core:framework", @@ -800,28 +802,6 @@ cc_library( ], ) -# For a more maintainable build this target should not exist and the headers -# should be split into the existing cc_library targets, but this change was -# automatically done so that we can remove long standing issues and complexity -# in the build system. It's up to the OWNERS of this package to get rid of it or -# not. The use of the textual_hdrs attribute is discouraged, use hdrs instead. -# Here it is used to avoid header parsing errors in packages where the feature -# parse_headers was enabled since loose headers were not being parsed. See -# go/loose-lsc-one-target-approach for more details. -cc_library( - name = "loose_headers", - tags = ["avoid_dep"], - textual_hdrs = [ - "cuda_sparse.h", - "gpu_solvers.h", - ], - visibility = [ - "//tensorflow/core/kernels:__pkg__", - "//tensorflow/core/kernels/linalg:__pkg__", - "//tensorflow/core/kernels/sparse:__pkg__", - ], -) - # Tests. 
tf_cc_test( name = "overflow_test", @@ -852,6 +832,7 @@ tf_cuda_only_cc_test( srcs = [ "gpu_kernel_helper_test.cu.cc", ], + features = ["-layering_check"], tags = [ "no_cuda_asan", # TODO(b/171342366): re-enable. ], @@ -890,6 +871,7 @@ tf_cc_tests( "tensor_slice_writer_test.cc", "work_sharder_test.cc", ], + features = ["-layering_check"], linkopts = select({ "//tensorflow:macos": ["-headerpad_max_install_names"], "//conditions:default": [], diff --git a/tensorflow/core/util/autotune_maps/BUILD b/tensorflow/core/util/autotune_maps/BUILD index 94ccd691946c78..f4f13211ab2f8e 100644 --- a/tensorflow/core/util/autotune_maps/BUILD +++ b/tensorflow/core/util/autotune_maps/BUILD @@ -107,7 +107,10 @@ tf_proto_library( "//tensorflow/core/util/autotune_maps:conv_parameters_proto", "@local_tsl//tsl/protobuf:dnn_proto", ], - visibility = ["//waymo/ml/deploy/system/autotuning:__subpackages__"], + visibility = [ + "//waymo/ml/deploy/benchmark:__subpackages__", + "//waymo/ml/deploy/system/autotuning:__subpackages__", + ], ) # copybara:uncomment_begin(google-only) diff --git a/tensorflow/core/util/command_line_flags_test.cc b/tensorflow/core/util/command_line_flags_test.cc index 221f347c22bea2..d76d2fce3d0b03 100644 --- a/tensorflow/core/util/command_line_flags_test.cc +++ b/tensorflow/core/util/command_line_flags_test.cc @@ -43,6 +43,7 @@ TEST(CommandLineFlagsTest, BasicUsage) { bool some_switch_set_directly = false; bool some_switch_set_via_hook = true; bool some_switch_set_capitalized = false; + bool some_switch_set_by_number = false; string some_name_set_directly = "something_a"; string some_name_set_via_hook = "something_b"; float some_float_set_directly = -23.23f; @@ -55,6 +56,7 @@ TEST(CommandLineFlagsTest, BasicUsage) { "--some_switch_set_directly", "--some_switch_set_via_hook=false", "--some_switch_set_capitalized=True", + "--some_switch_set_by_number=1", "--some_name_set_directly=somethingelse", "--some_name_set_via_hook=anythingelse", "--some_float_set_directly=42.0", @@ -93,6 +95,8 @@ TEST(CommandLineFlagsTest, BasicUsage) { some_switch_set_via_hook, "some switch set via hook"), Flag("some_switch_set_capitalized", &some_switch_set_capitalized, "some switch set capitalized"), + Flag("some_switch_set_by_number", &some_switch_set_by_number, + "some switch set by number"), Flag("some_name_set_directly", &some_name_set_directly, "some name set directly"), Flag( @@ -121,6 +125,7 @@ TEST(CommandLineFlagsTest, BasicUsage) { EXPECT_EQ(true, some_switch_set_directly); EXPECT_EQ(false, some_switch_set_via_hook); EXPECT_EQ(true, some_switch_set_capitalized); + EXPECT_EQ(true, some_switch_set_by_number); EXPECT_EQ("somethingelse", some_name_set_directly); EXPECT_EQ("anythingelse", some_name_set_via_hook); EXPECT_NEAR(42.0f, some_float_set_directly, 1e-5f); diff --git a/tensorflow/dtensor/cc/dtensor_device.cc b/tensorflow/dtensor/cc/dtensor_device.cc index 81d03d14ace24e..b9ddf25fde6caf 100644 --- a/tensorflow/dtensor/cc/dtensor_device.cc +++ b/tensorflow/dtensor/cc/dtensor_device.cc @@ -2384,6 +2384,9 @@ void DTensorDevice::Execute(const TFE_Op* original_op, int* num_outputs, absl::flat_hash_set input_meshes; std::vector single_device_input_indices; + VLOG(4) << "DTensorOperation: " << dtensor_operation.name + << " num_inputs are " << num_inputs; + typed_inputs.resize(num_inputs); for (int j = 0; j < num_inputs; ++j) { TFE_TensorHandle* input = inputs[j]; @@ -2392,6 +2395,8 @@ void DTensorDevice::Execute(const TFE_Op* original_op, int* num_outputs, if (name_ != input_device) { 
single_device_input_indices.push_back(j); typed_inputs[j] = nullptr; + VLOG(5) << "Input " << j << ": " + << tensorflow::unwrap(input)->DebugString(); continue; } // Handle input which is on DTensor device already. @@ -2404,10 +2409,15 @@ void DTensorDevice::Execute(const TFE_Op* original_op, int* num_outputs, input_meshes.insert(t->layout().mesh()); } typed_inputs[j] = t; + VLOG(5) << "Input " << j << ": " << typed_inputs[j]->DebugString(); } const std::optional mesh = ChooseBroadcastingMesh(input_meshes, dtypes); + VLOG(4) << "Execution DTensorOperation: " << dtensor_operation.name + << " with broadcast mesh " + << (mesh.has_value() ? mesh->ToString() : "no broadcast mesh"); + // TODO(feyu): This short circuit only allows running unsupported op // via DTensorDevice in eager mode. for tf.function and its graph, we will // need to build single device mesh placement rules in mesh propagation. diff --git a/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc b/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc index d39a71fad4d626..c1484beb5c2e3d 100644 --- a/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc +++ b/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project @@ -165,7 +166,7 @@ GetMergedMeshClusterResults(mlir::tf_device::ClusterOp current_cluster, // Updates the users of `merging_cluster` so that they use values // from `merged_cluster` instead. void ReplaceOperandUsagesWithMergedClusterOutputs( - const llvm::SmallVectorImpl& values_to_replace, + mlir::ValueRange values_to_replace, mlir::tf_device::ClusterOp merged_cluster) { for (auto result : llvm::zip(values_to_replace, merged_cluster.getResults())) { diff --git a/tensorflow/dtensor/mlir/dtensor_dialect/ir/dtensor_attributes.h b/tensorflow/dtensor/mlir/dtensor_dialect/ir/dtensor_attributes.h index fce2c014dc03b3..f54bdd248d3685 100644 --- a/tensorflow/dtensor/mlir/dtensor_dialect/ir/dtensor_attributes.h +++ b/tensorflow/dtensor/mlir/dtensor_dialect/ir/dtensor_attributes.h @@ -37,6 +37,8 @@ class MeshAttr using Base::Base; using Mesh = tensorflow::dtensor::Mesh; + static constexpr StringLiteral name = "dtensor.mesh"; + // Constructor of attribute static MeshAttr get(MLIRContext* context, const Mesh& mesh); @@ -52,6 +54,8 @@ class LayoutAttr : public Attribute::AttrBase& merged_layouts) { - for (const auto& merged_layout : merged_layouts) { + llvm::DenseMap& merged_layouts) { + for (auto& merged_layout : merged_layouts) { // merged_layout is a pair of mlir::Value and Layout. // If there is only one user of the Value and that user is a DTensorLayout // op, then we can skip creating the op as the layout is already there. Note diff --git a/tensorflow/dtensor/python/BUILD b/tensorflow/dtensor/python/BUILD index 4090792e1e97a8..bf0c9564a97bb8 100644 --- a/tensorflow/dtensor/python/BUILD +++ b/tensorflow/dtensor/python/BUILD @@ -1,7 +1,7 @@ # DTensor Python API and libraries. 
-load("//tensorflow:strict.default.bzl", "py_strict_library") load("//tensorflow:pytype.default.bzl", "pytype_strict_library") +load("//tensorflow:strict.default.bzl", "py_strict_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") default_visibility = [ @@ -100,6 +100,25 @@ pytype_strict_library( ], ) +pytype_strict_library( + name = "d_random", + srcs = ["d_random.py"], + srcs_version = "PY3", + deps = [ + ":api", + ":layout", + "//tensorflow/python/eager:context", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:math_ops_gen", + "//tensorflow/python/ops:shape_util", + "//tensorflow/python/ops:stateless_random_ops_gen", + ], +) + pytype_strict_library( name = "d_variable", srcs = ["d_variable.py"], diff --git a/tensorflow/dtensor/python/accelerator_util.py b/tensorflow/dtensor/python/accelerator_util.py index e32d54b36c1834..b1e96c169de4e1 100644 --- a/tensorflow/dtensor/python/accelerator_util.py +++ b/tensorflow/dtensor/python/accelerator_util.py @@ -121,6 +121,7 @@ def initialize_accelerator_system( enable_coordination_service: Optional[bool] = True, num_logical_cpu_devices: Optional[int] = None, experimental_reset_context: Optional[bool] = False, + experimental_enable_megcore: Optional[bool] = False, ) -> str: """Initializes accelerators and communication fabrics for DTensor. @@ -170,6 +171,7 @@ def initialize_accelerator_system( as an escape hatch, if there is no clear way to refactor your code to call initialize_accelerator_system() before calling TensorFlow APIs that initialize the context. + experimental_enable_megcore: Optionally enable megcore in backend. Returns: device_type: the type of accelerator that was initialized. @@ -258,7 +260,7 @@ def initialize_accelerator_system( ) if device_type == "TPU" and not config.backend_is_pw(): - tpu_util.initialize_tpu_system() + tpu_util.initialize_tpu_system(use_megacore=experimental_enable_megcore) _INITIALIZED_ACCELERATOR_SYSTEM_TYPE = device_type diff --git a/tensorflow/dtensor/python/d_random.py b/tensorflow/dtensor/python/d_random.py new file mode 100644 index 00000000000000..1697f3598151c7 --- /dev/null +++ b/tensorflow/dtensor/python/d_random.py @@ -0,0 +1,331 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""DTensor helpers for random generators.""" + +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import gen_stateless_random_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import shape_util + +# ------------------------------------------------------------------------------ +# stateless rngs +# ------------------------------------------------------------------------------ + + +# TODO(b/171746536): switch all rng ops to official versions once supported. +def _old_tf_random_stateless_normal( + shape, + seed, + mean=0.0, + stddev=1.0, + dtype=dtypes.float32, + name=None, + layout=None, +): + """DTensor stateless normal implementation that takes an layout.""" + with ops.name_scope( + name, "stateless_random_normal", [shape, seed, mean, stddev] + ) as name: + seed = ops.convert_to_tensor(seed, dtype=dtypes.int32, name="seed") + shape = shape_util.shape_tensor(shape) + mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean") + stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev") + rnd = api.call_with_layout( + gen_stateless_random_ops.stateless_random_normal, + layout, + shape, + seed, + dtype, + ) + result = math_ops.add(rnd * stddev, mean, name=name) + shape_util.maybe_set_static_shape(result, shape) + return result + + +def _old_tf_random_stateless_uniform( + shape, + seed, + minval=0, + maxval=None, + dtype=dtypes.float32, + name=None, + layout=None, +): + """DTensor stateless uniform implementation that takes an layout.""" + dtype = dtypes.as_dtype(dtype) + accepted_dtypes = ( + dtypes.float16, + dtypes.bfloat16, + dtypes.float32, + dtypes.float64, + dtypes.int32, + dtypes.int64, + dtypes.uint32, + dtypes.uint64, + ) + if dtype not in accepted_dtypes: + raise ValueError( + f"Argument `dtype` got invalid value {dtype}. Accepted dtypes are " + f"{accepted_dtypes}." + ) + if dtype.is_integer: + if (minval is None) != (maxval is None): + raise ValueError( + f"For integer `dtype` argument {dtype}, argument `minval` and " + f"`maxval` must be both None or not None. Got `minval`={minval} and " + f"`maxval`={maxval}." + ) + if minval is not None and dtype in (dtypes.uint32, dtypes.uint64): + raise ValueError( + f"Argument `dtype` got invalid value {dtype} when argument `minval` " + "is not None. Please don't use unsigned integers in this case." 
+ ) + + shape = shape_util.shape_tensor(shape) + with ops.name_scope( + name, "stateless_random_uniform", [shape, seed, minval, maxval] + ) as name: + seed = ops.convert_to_tensor(seed, dtype_hint=dtypes.int32, name="seed") + + if dtype.is_integer and minval is None and maxval is None: + result = api.call_with_layout( + gen_stateless_random_ops.stateless_random_uniform_full_int, + layout, + shape, + seed=seed, + dtype=dtype, + name=name, + ) + else: + if not dtype.is_integer and maxval is None: + maxval = 1 + val_range = ops.convert_to_tensor( + maxval - minval, dtype=dtype, name="range" + ) + minval = ops.convert_to_tensor(minval, dtype=dtype, name="min") + if dtype.is_integer: + result = api.call_with_layout( + gen_stateless_random_ops.stateless_random_uniform_int, + layout, + shape, + seed=seed, + minval=minval, + maxval=maxval, + ) + else: + rnd = api.call_with_layout( + gen_stateless_random_ops.stateless_random_uniform, + layout, + shape, + seed=seed, + dtype=dtype, + ) + result = math_ops.add(rnd * val_range, minval, name=name) + shape_util.maybe_set_static_shape(result, shape) + return result + + +def _old_tf_stateless_truncated_normal( + shape, + seed, + mean=0.0, + stddev=1.0, + dtype=dtypes.float32, + name=None, + layout=None, +): + """DTensor stateless truncated normal implementation that takes an layout.""" + with ops.name_scope( + name, "stateless_truncated_normal", [shape, seed, mean, stddev] + ) as name: + seed = ops.convert_to_tensor(seed, dtype=dtypes.int32, name="seed") + shape = shape_util.shape_tensor(shape) + mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean") + stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev") + rnd = api.call_with_layout( + gen_stateless_random_ops.stateless_truncated_normal, + layout, + shape, + seed, + dtype, + ) + result = math_ops.add(rnd * stddev, mean, name=name) + shape_util.maybe_set_static_shape(result, shape) + return result + + +def stateless_random_normal( + shape, + seed, + mean=0.0, + stddev=1.0, + dtype=dtypes.float32, + name=None, + layout=None, +): + """DTensor stateless RNG.""" + if not context.executing_eagerly(): + layout = None + + return _old_tf_random_stateless_normal( + shape, + seed=seed, + mean=mean, + stddev=stddev, + dtype=dtype, + name=name, + layout=layout, + ) + + +def stateless_random_uniform( + shape, + seed, + minval=0, + maxval=None, + dtype=dtypes.float32, + name=None, + layout=None, +): + """DTensor stateless random uniform.""" + if not context.executing_eagerly(): + layout = None + + return _old_tf_random_stateless_uniform( + shape, + seed=seed, + minval=minval, + maxval=maxval, + dtype=dtype, + name=name, + layout=layout, + ) + + +def stateless_truncated_normal( + shape, + seed, + mean=0.0, + stddev=1.0, + dtype=dtypes.float32, + name=None, + layout=None, +): + """DTensor stateless RNG.""" + if not context.executing_eagerly(): + layout = None + + return _old_tf_stateless_truncated_normal( + shape, + seed=seed, + mean=mean, + stddev=stddev, + dtype=dtype, + name=name, + layout=layout, + ) + + +def stateless_split(seed, num=2, mesh=None): + seed = ops.convert_to_tensor(seed) + layout = None + if mesh: + layout = layout_lib.Layout.replicated(mesh, rank=2) + return stateless_random_uniform( + shape=[num, 2], + seed=seed, + dtype=seed.dtype, + minval=None, + maxval=None, + layout=layout, + ) + + +# ------------------------------------------------------------------------------ +# stateless dropout. 
+# ------------------------------------------------------------------------------ + + +def _get_noise_shape(x, noise_shape): + """Noisve shape util copied from tf nn_ops.""" + # If noise_shape is none return immediately. + if noise_shape is None: + return array_ops.shape(x) + + try: + # Best effort to figure out the intended shape. + # If not possible, let the op to handle it. + # In eager mode exception will show up. + noise_shape_ = tensor_shape.as_shape(noise_shape) + except (TypeError, ValueError): + return noise_shape + + if x.shape.dims is not None and len(x.shape.dims) == len(noise_shape_.dims): + new_dims = [] + for i, dim in enumerate(x.shape.dims): + if noise_shape_.dims[i].value is None and dim.value is not None: + new_dims.append(dim.value) + else: + new_dims.append(noise_shape_.dims[i].value) + return tensor_shape.TensorShape(new_dims) + + return noise_shape + + +# TODO(b/171213877, b/169909066): Fix layout prop in function case for the rng +# Op used. The layout prop should be able to propagate the layout from input +# tensor `x` to the tf.mul and then back propagate the layout to the +# `random_tensor`. +def dropout(x, rate, noise_shape=None, seed=None, name=None): + """DTensor replacement for dropout.""" + if not isinstance(rate, float): + raise ValueError("rate should be float for dropout.") + if seed is None: + raise ValueError("seed must be specified for DTensor dropout. Got: None") + + with ops.name_scope(name, "dropout", [x]): + x_dtype = x.dtype + keep_prob = 1 - rate + scale = 1 / keep_prob + scale = ops.convert_to_tensor(scale, dtype=x_dtype) + ret = gen_math_ops.mul(x, scale) + + noise_shape = _get_noise_shape(x, noise_shape) + # stateless_random_uniform requires a shape [2] seed. + seed = [seed, 0] + + if context.executing_eagerly(): + layout = api.fetch_layout(x) + else: + layout = None + random_tensor = _old_tf_random_stateless_uniform( + noise_shape, seed=seed, minval=0, maxval=1, dtype=x_dtype, layout=layout + ) + keep_mask = random_tensor >= rate + ret = gen_math_ops.mul(ret, gen_math_ops.cast(keep_mask, x_dtype)) + if not context.executing_eagerly(): + ret.set_shape(x.get_shape()) + return ret + + +# TODO(b/195413777): error out for stateful dropout. diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD index af1cd7b54c5a74..0a7d9f90345e30 100644 --- a/tensorflow/dtensor/python/tests/BUILD +++ b/tensorflow/dtensor/python/tests/BUILD @@ -8,6 +8,7 @@ load( "PATHWAYS", "PATHWAYS_V3_DONUT_BACKEND", "TPU_V3_DONUT_BACKEND", + "TPU_V4_DONUT_BACKEND", "dtensor_test", ) @@ -69,6 +70,32 @@ pytype_strict_library( ], ) +py_strict_test( + name = "api_test", + srcs = [ + "api_test.py", + ], + python_version = "PY3", + deps = [ + ":test_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:d_random", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + # TODO(b/301286466): Investigate why python annotation type mismatch is not catptured by the type # strict BUILD rules. 
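The dropout helper added in d_random.py above is standard inverted dropout: the input is pre-scaled by 1/(1 - rate), then positions where a stateless uniform draw falls below rate are zeroed, so the expected value of the output equals the input; the seed is required so that every device in the mesh draws the same mask. A plain numpy sketch of the same arithmetic (illustrative only; it ignores layouts, DTensor packing, and noise_shape broadcasting):

import numpy as np

def dropout_sketch(x, rate, seed):
    rng = np.random.default_rng(seed)  # stand-in for the stateless uniform op
    keep_prob = 1.0 - rate
    scaled = x / keep_prob             # pre-scale so the expected output equals x
    keep_mask = rng.uniform(size=x.shape) >= rate
    return scaled * keep_mask

dropout_sketch(np.ones((2, 2)), rate=0.5, seed=0)  # roughly half the entries become 2.0, the rest 0.0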
@@ -88,6 +115,64 @@ dtensor_test( ], ) +dtensor_test( + name = "batchparallel_spmd_test", + srcs = ["batchparallel_spmd_test.py"], + additional_backends = [TPU_V4_DONUT_BACKEND], + main = "batchparallel_spmd_test.py", + shard_count = { + "cpu": 4, + "gpu": 4, + "tpu": 4, + TPU_V4_DONUT_BACKEND: 8, + }, + deps = [ + ":test_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", + "//tensorflow/python/ops:image_ops_gen", + "//tensorflow/python/ops:linalg_ops_gen", + "//tensorflow/python/ops:nn_impl", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +dtensor_test( + name = "cache_test", + srcs = ["cache_test.py"], + main = "cache_test.py", + tags = [ + "nomultivm", + ], + deps = [ + ":test_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:d_variable", + "//tensorflow/dtensor/python:layout", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:combinations", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:stateless_random_ops_gen", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + ], +) + dtensor_test( name = "config_test", srcs = ["config_test.py"], @@ -162,6 +247,35 @@ dtensor_test( ], ) +dtensor_test( + name = "conv_test", + srcs = [ + "conv_test.py", + ], + additional_backends = [TPU_V3_DONUT_BACKEND], + # All tests require 8 TPUs. + disable = ["tpu"], + shard_count = { + "cpu": 4, + "gpu": 4, + TPU_V3_DONUT_BACKEND: 4, + }, + deps = [ + ":test_util", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:special_math_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + dtensor_test( name = "device_test", srcs = ["device_test.py"], @@ -224,6 +338,52 @@ py_strict_test( ], ) +py_strict_test( + name = "multi_client_input_util_test", + timeout = "long", + srcs = ["multi_client_input_util_test.py"], + env = { + "TF2_BEHAVIOR": "1", + }, + shard_count = 8, + tags = [ + # ThreadSanitizer does not support starting new threads after multi-threaded fork. + "notsan", + "no_oss", # Fails on OSS. 
+ "nosan", # b/195537906 + ], + deps = [ + ":multi_client_test_util", + ":test_util", + "//tensorflow/core:protos_all_py", + "//tensorflow/dtensor/python:accelerator_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:config", + "//tensorflow/dtensor/python:input_util", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:mesh_util", + "//tensorflow/python/data/experimental/service:server_lib", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/data/ops:readers", + "//tensorflow/python/eager:context", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:device_spec", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/lib/io:tf_record", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:check_ops", + "//tensorflow/python/ops:io_ops", + "//tensorflow/python/ops:parsing_config", + "//tensorflow/python/ops:parsing_ops", + "//tensorflow/python/ops:parsing_ops_gen", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/logging", + "@absl_py//absl/testing:parameterized", + ], +) + dtensor_test( name = "layout_test", srcs = ["layout_test.py"], @@ -678,6 +838,81 @@ dtensor_test( ], ) +dtensor_test( + name = "rng_test", + size = "medium", + srcs = ["rng_test.py"], + additional_backends = [TPU_V3_DONUT_BACKEND], + # Requires at least 8 TPUs to run the tests. + disable = ["tpu"], + disable_tfrt = [ + "gpu", + TPU_V3_DONUT_BACKEND, + ], + main = "rng_test.py", + shard_count = { + "cpu": 20, + "tpu": 10, + "gpu": 30, + TPU_V3_DONUT_BACKEND: 20, + }, + deps = [ + ":test_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:d_variable", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/distribute:tpu_strategy", + "//tensorflow/python/distribute/cluster_resolver/tpu:tpu_cluster_resolver_py", + "//tensorflow/python/eager:remote", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:bitwise_ops_gen", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:stateful_random_ops_gen", + "//tensorflow/python/ops:stateless_random_ops_v2_gen", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/tpu:device_assignment", + "@absl_py//absl/testing:parameterized", + ], +) + +dtensor_test( + name = "save_restore_v2_test", + srcs = ["save_restore_v2_test.py"], + additional_backends = [ + TPU_V3_DONUT_BACKEND, + TPU_V4_DONUT_BACKEND, + ], + main = "save_restore_v2_test.py", + shard_count = { + "cpu": 8, + "gpu": 8, + TPU_V3_DONUT_BACKEND: 8, + }, + deps = [ + ":test_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:d_variable", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/checkpoint", + "//tensorflow/python/checkpoint:checkpoint_management", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/module", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + dtensor_test( name = 
"variable_test", srcs = ["variable_test.py"], @@ -707,3 +942,97 @@ dtensor_test( "//third_party/py/numpy", ], ) + +dtensor_test( + name = "mnist_test", + size = "large", + srcs = ["mnist_test.py"], + shard_count = { + "tpu": 2, + }, + tags = ["nosan"], # Non-opt builds has slow XLA compilation. + deps = [ + ":test_util", + "//tensorflow/dtensor/python:api", + "//tensorflow/dtensor/python:d_variable", + "//tensorflow/dtensor/python:input_util", + "//tensorflow/dtensor/python:layout", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +dtensor_test( + name = "numerics_test", + srcs = ["numerics_test.py"], + additional_backends = [TPU_V3_DONUT_BACKEND], + disable = ALL_BACKENDS, + enable = [ + "tpu", + ], + deps = [ + ":test_util", + "//tensorflow/dtensor/python:accelerator_util", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +dtensor_test( + name = "sparse_test", + srcs = ["sparse_test.py"], + main = "sparse_test.py", + shard_count = { + "cpu": 4, + }, + deps = [ + ":test_util", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/python/eager/polymorphic_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +dtensor_test( + name = "tpu_device_assignment_test", + srcs = ["tpu_device_assignment_test.py"], + disable = ALL_BACKENDS, + enable = [ + "tpu", + ], + deps = [ + ":test_util", + "//tensorflow/dtensor/python:layout", + "//tensorflow/dtensor/python:numpy_util", + "//tensorflow/dtensor/python:tpu_util", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/platform:client_testlib", + ], +) diff --git a/tensorflow/dtensor/python/tests/api_test.py b/tensorflow/dtensor/python/tests/api_test.py new file mode 100644 index 00000000000000..7231086651439f --- /dev/null +++ b/tensorflow/dtensor/python/tests/api_test.py @@ -0,0 +1,305 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for the internal DTensor Python API.""" + +from absl.testing import parameterized +import numpy as np + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import d_random +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import stateless_random_ops +from tensorflow.python.platform import test + +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh +_MESH_DIM_X = 'x' +_MESH_DIM_Y = 'y' + + +class APITest(test_util.DTensorBaseTest): + + def setUp(self): + super(APITest, self).setUp() + global_ids = test_util.create_device_ids_array((2, 2)) + local_device_ids = np.ravel(global_ids).tolist() + mesh_dict = { + 'CPU': Mesh( + [_MESH_DIM_X, _MESH_DIM_Y], + global_ids, + local_device_ids, + test_util.create_device_list((2, 2), 'CPU'), + ) + } + self.mesh = self.configTestMesh(mesh_dict) + self.layouts_1d = [ + Layout.replicated(self.mesh, rank=1), + Layout.batch_sharded(self.mesh, _MESH_DIM_X, rank=1), + Layout.batch_sharded(self.mesh, _MESH_DIM_Y, rank=1), + ] + self.layouts_2d = [ + Layout.replicated(self.mesh, rank=2), + Layout.batch_sharded(self.mesh, _MESH_DIM_X, rank=2), + Layout.inner_sharded(self.mesh, _MESH_DIM_X, rank=2), + Layout([_MESH_DIM_X, _MESH_DIM_Y], self.mesh), + ] + + def testV2API(self): + layout = Layout.replicated(self.mesh, rank=1) + zero_tensor = array_ops.zeros([10], layout=layout) + zero_like_tensor = array_ops.zeros_like_v2(zero_tensor, layout=layout) + self.assertAllEqual(zero_like_tensor.numpy(), zero_tensor.numpy()) + + ones_tensor = array_ops.ones([10], layout=layout) + ones_like_tensor = array_ops.ones_like_v2(zero_tensor, layout=layout) + self.assertAllEqual(ones_like_tensor.numpy(), ones_tensor.numpy()) + + def testStatelessRandom(self): + # test dtype default float32 random + result = stateless_random_ops.stateless_random_uniform( + [10], + seed=constant_op.constant([0, 0], dtype=dtypes.int64), + minval=0.0, + maxval=10.0, + ) + self.assertEqual([10], result.shape) + + # test dtype default int32 minval maxval are both None + result = stateless_random_ops.stateless_random_uniform( + [10], + seed=constant_op.constant([1, 2], dtype=dtypes.int64), + dtype=dtypes.int32, + minval=None, + maxval=None, + ) + self.assertEqual([10], result.shape) + + # test maxval is None or not given + result = stateless_random_ops.stateless_random_uniform( + [10], + seed=constant_op.constant([1, 2], dtype=dtypes.int64), + maxval=12, + dtype=dtypes.int32, + ) + self.assertEqual([10], result.shape) + self.assertAllInRange(result, 0, 12) + + def testStatelessRandomNormal(self): + # test dtype default float32 random + result = stateless_random_ops.stateless_random_normal( + [10], seed=constant_op.constant([0, 0], dtype=dtypes.int32) + ) + self.assertEqual([10], result.shape) + + # test dtype double + result = 
stateless_random_ops.stateless_random_normal( + [10], + seed=constant_op.constant([1, 2], dtype=dtypes.int32), + dtype=dtypes.double, + ) + self.assertEqual([10], result.shape) + + # test mean and stddev + result = stateless_random_ops.stateless_random_normal( + [10], + seed=constant_op.constant([1, 2], dtype=dtypes.int32), + mean=0, + stddev=0, + ) + self.assertEqual([10], result.shape) + self.assertAllInRange(result, 0, 0) + + # test dtensor version of each, check layouts + layout = Layout.replicated(self.mesh, rank=1) + + # test dtype default float 32 random + result = d_random.stateless_random_normal( + [10], + seed=constant_op.constant([0, 0], dtype=dtypes.int32), + layout=layout, + ) + self.assertEqual([10], result.shape) + self.assertEqual(layout, api.fetch_layout(result)) + + # test dtype double + result = d_random.stateless_random_normal( + [10], + seed=constant_op.constant([1, 2], dtype=dtypes.int32), + dtype=dtypes.double, + layout=layout, + ) + self.assertEqual([10], result.shape) + self.assertEqual(layout, api.fetch_layout(result)) + + # test mean and stddev + result = d_random.stateless_random_normal( + [10], + seed=constant_op.constant([1, 2], dtype=dtypes.int32), + mean=0, + stddev=0, + layout=layout, + ) + self.assertEqual([10], result.shape) + self.assertAllInRange(result, 0, 0) + self.assertEqual(layout, api.fetch_layout(result)) + + @parameterized.named_parameters(*set( + test_util.product((('_labels_unsharded', 0), ('_labels_batch', 1), + ('_labels_inner', 2), ('_labels_both', 3)), + (('_logits_unsharded', 0), ('_logits_batch', 1), + ('_logits_inner', 2), ('_logits_both', 3))))) + def testSoftmaxCrossentropyWithLogits(self, labels_layout, logits_layout): + expected_layout = Layout.replicated(self.mesh, rank=1) + if (labels_layout == 1 or labels_layout == 3 or logits_layout == 1 or + logits_layout == 3): + expected_layout = Layout.inner_sharded(self.mesh, _MESH_DIM_X, rank=1) + + labels_layout = self.layouts_2d[labels_layout] + logits_layout = self.layouts_2d[logits_layout] + labels_numpy = np.random.uniform(size=[6, 4]) + logits_numpy = np.random.uniform(size=[6, 4]) + labels = constant_op.constant(labels_numpy, dtype=dtypes.float32) + logits = constant_op.constant(logits_numpy, dtype=dtypes.float32) + + # Should we test against the built in version or the patched version? 
+ expected = nn_ops.softmax_cross_entropy_with_logits_v2( + labels=labels, logits=logits + ) + + labels = numpy_util.pack_numpy(labels, labels_layout) + logits = numpy_util.pack_numpy(logits, logits_layout) + dtensor_result = nn_ops.softmax_cross_entropy_with_logits_v2( + labels=labels, logits=logits + ) + self.assertDTensorEqual(expected, expected_layout, dtensor_result) + + @parameterized.named_parameters(*set( + test_util.product((('_labels_unsharded', 0), ('_labels_batch_x', 1), + ('_labels_batch_y', 2)), + (('_logits_unsharded', 0), ('_logits_batch', 1), + ('_logits_inner', 2), ('_logits_both', 3))))) + def testSparseSoftmaxCrossentropyWithLogits(self, labels_layout, + logits_layout): + expected_layout = Layout.replicated(self.mesh, rank=1) + if labels_layout == 1 or logits_layout == 1 or logits_layout == 3: + expected_layout = Layout.inner_sharded(self.mesh, _MESH_DIM_X, rank=1) + elif labels_layout == 2: + expected_layout = Layout.inner_sharded(self.mesh, _MESH_DIM_Y, rank=1) + + labels_layout = self.layouts_1d[labels_layout] + logits_layout = self.layouts_2d[logits_layout] + labels_numpy = np.random.randint(size=[6], low=0, high=4) + logits_numpy = np.random.uniform(size=[6, 4]) + labels = constant_op.constant(labels_numpy, dtype=dtypes.int64) + logits = constant_op.constant(logits_numpy, dtype=dtypes.float32) + + # Should we test against the built in version or the patched version? + expected = nn_ops.sparse_softmax_cross_entropy_with_logits_v2( + labels=labels, logits=logits + ) + + labels = numpy_util.pack_numpy(labels, labels_layout) + logits = numpy_util.pack_numpy(logits, logits_layout) + dtensor_result = nn_ops.sparse_softmax_cross_entropy_with_logits_v2( + labels=labels, logits=logits + ) + self.assertDTensorEqual(expected, expected_layout, dtensor_result) + + def test_dropout_raises_on_none_seed(self): + with api.default_mesh(self.mesh): + with self.assertRaisesRegex(ValueError, 'seed must be specified'): + _ = d_random.dropout( + array_ops.ones([2, 2], dtype=dtypes.float32), rate=0.5, seed=None + ) + + def test_default_mesh(self): + + @polymorphic_function.function + def func(a): + return a + 3.0 + + with api.default_mesh(self.mesh): + a = array_ops.zeros(shape=()) + result = func(a) + + self.assertEqual(result, 3.0) + self.assertEqual(api.fetch_layout(result).mesh, self.mesh) + self.assertTrue(api.fetch_layout(result).is_fully_replicated()) + self.assertEqual(result.device, api.device_name()) + + # Also make sure it works as wrapper + @api.default_mesh(self.mesh) + def func2(): + b = array_ops.ones(shape=()) + return func(b) + + result = func2() + self.assertEqual(result, 4.0) + self.assertEqual(api.fetch_layout(result).mesh, self.mesh) + self.assertTrue(api.fetch_layout(result).is_fully_replicated()) + self.assertEqual(result.device, api.device_name()) + + with self.assertRaisesRegex(ValueError, 'Expect `mesh` to be `Mesh`'): + with api.default_mesh(None): + pass + + def test_default_mesh_with_constant(self): + + @polymorphic_function.function + def func(): + return constant_op.constant([3, 4]) + + with api.default_mesh(self.mesh): + result = func() + + self.assertAllEqual(result, [3, 4]) + self.assertEqual(api.fetch_layout(result).mesh, self.mesh) + self.assertTrue(api.fetch_layout(result).is_fully_replicated()) + self.assertEqual(result.device, api.device_name()) + + def test_error_no_default_mesh(self): + with self.assertRaisesRegex( + errors_impl.InvalidArgumentError, + 'No default mesh has been registered to DTensor', + ): + with ops.device_v2(api.device_name()): + 
_ = constant_op.constant(3.0) + + def test_get_default_mesh(self): + self.assertIsNone(api.get_default_mesh()) + with api.default_mesh(self.mesh): + self.assertEqual(api.get_default_mesh(), self.mesh) + + with api.default_mesh(self.mesh.host_mesh()): + self.assertEqual(api.get_default_mesh(), self.mesh.host_mesh()) + + self.assertEqual(api.get_default_mesh(), self.mesh) + + self.assertIsNone(api.get_default_mesh()) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tests/batchparallel_spmd_test.py b/tensorflow/dtensor/python/tests/batchparallel_spmd_test.py new file mode 100644 index 00000000000000..b6cbbd0459b8e6 --- /dev/null +++ b/tensorflow/dtensor/python/tests/batchparallel_spmd_test.py @@ -0,0 +1,660 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for batchparallel_spmd.""" + +import itertools +from absl.testing import parameterized +import numpy as np + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.dtensor.python.tests import test_util_ops +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import gen_image_ops +from tensorflow.python.ops import gen_linalg_ops +from tensorflow.python.ops import nn_impl +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import test +# pylint: enable=g-direct-tensorflow-import + +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh + + +class DTensorBatchParallelSPMDTest(test_util.DTensorBaseTest): + + def setUp(self): + super(DTensorBatchParallelSPMDTest, self).setUp() + + self.skipForDeviceType(['TPU'], + 'all tests require 8 TPU cores.', + unless_device_count_equals_to=8) + # Builds a 8x2 mesh. + self._mesh_dim_b = 'b' + self._mesh_dim_x = 'x' + self._dims = [self._mesh_dim_b, self._mesh_dim_x] + + global_ids = test_util.create_device_ids_array((4, 2)) + local_ids = np.ravel(global_ids).tolist() + mesh_dict = { + device: Mesh( + self._dims, + global_ids, + local_ids, + test_util.create_device_list((4, 2), device), + ) + for device in ('CPU', 'GPU', 'TPU') + } + self.mesh = self.configTestMesh(mesh_dict) + context.ensure_initialized() + + # Creates a bunch of common layouts used by tests later. 
+ # 4-d + self.replicated_layout_4d = Layout.replicated(self.mesh, rank=4) + self.batch_layout_4d = Layout.batch_sharded( + self.mesh, self._mesh_dim_b, rank=4) + + # 5-d + self.replicated_layout_5d = Layout.replicated(self.mesh, rank=5) + self.batch_layout_5d = Layout.batch_sharded( + self.mesh, self._mesh_dim_b, rank=5) + + @parameterized.named_parameters(('NoBatchDim', 0), ('SingleBatchDim', 1), + ('TwoBatchDim', 2)) + def testCholesky(self, num_batch_dim): + # Input needs to be symmetric and positive definite. + x = constant_op.constant( + [[1, 1, 1, 1], [1, 5, 5, 5], [1, 5, 14, 14], [1, 5, 14, 17]], + dtype=dtypes.float32, + ) + for _ in range(num_batch_dim): + x = array_ops.expand_dims_v2(x, 0) + s = [4] + [1 for _ in range(array_ops.rank(x) - 1)] + x = gen_array_ops.tile(x, s) + + expected_result = gen_linalg_ops.cholesky(x) + + if num_batch_dim == 0: + layout_spec = [] + elif num_batch_dim == 1: + layout_spec = [self._mesh_dim_b] + elif num_batch_dim == 2: + layout_spec = [self._mesh_dim_b, self._mesh_dim_x] + layout = Layout(layout_spec + ['unsharded'] * 2, self.mesh) + + x = numpy_util.pack_numpy(x, layout) + got = gen_linalg_ops.cholesky(input=x) + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters( + test_util.product( + [('NoBatchDim', 0), ('SingleBatchDim', 1), ('TwoBatchDim', 2)], + test_util_ops.FFT_OPS, + ) + ) + def testFFT(self, num_batch_dim, fft_op, num_nonbatch_dim): + shape = [4 for i in range(num_batch_dim + num_nonbatch_dim)] + np.random.seed(123) + x = constant_op.constant( + np.random.normal(0.0, 1.0, np.prod(shape)).reshape(shape), + dtype=dtypes.complex64, + ) + expected_result = fft_op(input=x) + + if num_batch_dim == 0: + layout_spec = [] + elif num_batch_dim == 1: + layout_spec = [self._mesh_dim_b] + elif num_batch_dim == 2: + layout_spec = [self._mesh_dim_b, self._mesh_dim_x] + layout = Layout(layout_spec + ['unsharded'] * num_nonbatch_dim, self.mesh) + + x = numpy_util.pack_numpy(x, layout) + got = fft_op(input=x) + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters( + test_util.product( + [('NoBatchDim', 0), ('SingleBatchDim', 1), ('TwoBatchDim', 2)], + test_util_ops.RFFT_OPS, + ) + ) + def testRFFT(self, num_batch_dim, rfft_op, num_nonbatch_dim, dtype): + self.skipForDeviceType(['GPU'], 'RFFT has numerical issues on GPU') + shape = [4 for i in range(num_batch_dim + num_nonbatch_dim)] + np.random.seed(123) + x = constant_op.constant( + np.random.normal(0.0, 1.0, np.prod(shape)).reshape(shape), dtype=dtype + ) + expected_result = rfft_op(input=x, fft_length=[2] * num_nonbatch_dim) + + if num_batch_dim == 0: + layout_spec = [] + elif num_batch_dim == 1: + layout_spec = [self._mesh_dim_b] + elif num_batch_dim == 2: + layout_spec = [self._mesh_dim_b, self._mesh_dim_x] + layout = Layout(layout_spec + ['unsharded'] * num_nonbatch_dim, self.mesh) + + x = numpy_util.pack_numpy(x, layout) + got = rfft_op(input=x, fft_length=[2] * num_nonbatch_dim) + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters( + test_util.product( + [('Replicated', 'replicated'), ('Sharded', 'batch')], + [ + ( + 'SamePadding', + 'SAME', + ), + ( + 'ValidPadding', + 'VALID', + ), + ], + test_util_ops.BATCH_PARALLEL_2D_WINDOW_OPS, + ) + ) + def test2DWindowOp(self, layout_spec, padding, op): + np.random.seed(123) + row_window_size = 3 + col_window_size = 4 + window_size = [1, row_window_size, col_window_size, 1] + stride_size = [1, row_window_size - 1, col_window_size - 1, 1] + 
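+    # Note (descriptive comment, assumption about intent): the spatial dims
+    # below are derived from the window sizes so the strided windows tile the
+    # input exactly (5 x 7 output positions under VALID padding).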
+ num_rows = (row_window_size - 1) * 5 + 1 + num_cols = (col_window_size - 1) * 7 + 1 + x_in = np.random.normal(0.0, 1.0, 8 * num_rows * num_cols * 3).reshape( + [8, num_rows, num_cols, 3]) + + inputs = constant_op.constant(x_in, dtype=dtypes.float32) + expected_result = op(inputs, window_size, stride_size, padding) + + if layout_spec == 'replicated': + layout = self.replicated_layout_4d + else: + layout = self.batch_layout_4d + + x = numpy_util.pack_numpy(inputs, layout) + got = op(x, window_size, stride_size, padding) + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters( + test_util.product( + [('Replicated', 'replicated'), ('BatchSharded', 'batch')], + [ + ( + 'SamePadding', + 'SAME', + ), + ( + 'ValidPadding', + 'VALID', + ), + ], + test_util_ops.BATCH_PARALLEL_3D_WINDOW_OPS, + ) + ) + def test3DWindowOp(self, layout_spec, padding, op): + np.random.seed(123) + dep_window_size = 2 + row_window_size = 3 + col_window_size = 4 + window_size = [1, dep_window_size, row_window_size, col_window_size, 1] + stride_size = [ + 1, dep_window_size - 1, row_window_size - 1, col_window_size - 1, 1 + ] + + num_deps = 3 + num_rows = (row_window_size - 1) * 5 + 1 + num_cols = (col_window_size - 1) * 7 + 1 + x_in = np.random.normal(0.0, 1.0, 8 * num_deps * num_rows * num_cols * + 3).reshape([8, num_deps, num_rows, num_cols, 3]) + + inputs = constant_op.constant(x_in, dtype=dtypes.float32) + expected_result = op(inputs, window_size, stride_size, padding) + + if layout_spec == 'replicated': + layout = self.replicated_layout_5d + else: + layout = self.batch_layout_5d + + x = numpy_util.pack_numpy(inputs, layout) + + got = op(x, window_size, stride_size, padding) + + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters(test_util_ops.PADDINGS) + def testDepthwiseConv2dNative(self, padding): + np.random.seed(123) + x_in = np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]) + + kernel_in = np.array([ + [[[2, 0.1]], [[3, 0.2]]], + [[[0, 0.3]], [[1, 0.4]]], + ]) + + inputs = constant_op.constant(x_in, dtype=dtypes.float32) + kernel = constant_op.constant(kernel_in, dtype=dtypes.float32) + expected_result = nn_impl.depthwise_conv2d_v2( + inputs, kernel, strides=[1, 1, 1, 1], padding=padding + ) + + layout = self.batch_layout_4d + + x = numpy_util.pack_numpy(inputs, layout) + kernel = numpy_util.pack_numpy(kernel, self.replicated_layout_4d) + got = nn_impl.depthwise_conv2d_v2( + x, kernel, strides=[1, 1, 1, 1], padding=padding + ) + + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters(('Sharded', 'sharded'), + ('Replicated', 'replicated')) + def testResizeBilinear(self, shard_spec): + np.random.seed(123) + images = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]), + dtype=dtypes.float32, + ) + + expected_result = gen_image_ops.resize_bilinear( + images=images, + size=[3, 3], + align_corners=False, + half_pixel_centers=False, + name=None, + ) + + if shard_spec == 'sharded': + layout = self.batch_layout_4d + else: + layout = self.replicated_layout_4d + images = numpy_util.pack_numpy(images, layout) + + got = gen_image_ops.resize_bilinear( + images=images, + size=[3, 3], + align_corners=False, + half_pixel_centers=False, + name=None, + ) + + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters(('Sharded', 'sharded'), + ('Replicated', 'replicated')) + def testResizeNearestNeighbor(self, shard_spec): + np.random.seed(123) + 
images = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]), + dtype=dtypes.float32, + ) + + expected_result = gen_image_ops.resize_nearest_neighbor( + images=images, + size=[3, 3], + align_corners=False, + half_pixel_centers=False, + name=None, + ) + + if shard_spec == 'sharded': + layout = self.batch_layout_4d + else: + layout = self.replicated_layout_4d + images = numpy_util.pack_numpy(images, layout) + + got = gen_image_ops.resize_nearest_neighbor( + images=images, + size=[3, 3], + align_corners=False, + half_pixel_centers=False, + name=None, + ) + + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters(('Sharded', 'sharded'), + ('Replicated', 'replicated')) + def testAdjustContrastv2(self, shard_spec): + np.random.seed(123) + images = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9 * 3).reshape([8, 9, 9, 3]), + dtype=dtypes.float32, + ) + + expected_result = gen_image_ops.adjust_contrastv2( + images=images, contrast_factor=0.5 + ) + + if shard_spec == 'sharded': + layout = self.batch_layout_4d + else: + layout = self.replicated_layout_4d + images = numpy_util.pack_numpy(images, layout) + + got = gen_image_ops.adjust_contrastv2(images=images, contrast_factor=0.5) + + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.named_parameters(('Sharded', 'sharded'), + ('Replicated', 'replicated')) + def testAdjustSaturation(self, shard_spec): + np.random.seed(123) + images = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9 * 3).reshape([8, 9, 9, 3]), + dtype=dtypes.float32, + ) + + expected_result = gen_image_ops.adjust_saturation(images=images, scale=0.5) + + if shard_spec == 'sharded': + layout = self.batch_layout_4d + else: + layout = self.replicated_layout_4d + images = numpy_util.pack_numpy(images, layout) + + got = gen_image_ops.adjust_saturation(images=images, scale=0.5) + + self.assertDTensorEqual(expected_result, layout, got) + + @parameterized.parameters( + itertools.permutations(['sharded', 'replicated'], 2)) + def testResizeBilinearGradBatchSharded(self, spec1, spec2): + np.random.seed(123) + images = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]), + dtype=dtypes.float32, + ) + grads = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]), + dtype=dtypes.float32, + ) + expected_result = gen_image_ops.resize_bilinear_grad( + grads=grads, + original_image=images, + align_corners=False, + half_pixel_centers=False, + name=None, + ) + + specs = [spec1, spec2] + layouts = [ + self.batch_layout_4d if spec == 'sharded' else self.replicated_layout_4d + for spec in specs + ] + + # Test images is replicated, grads is batch sharded + images = numpy_util.pack_numpy(images, layouts[0]) + grads = numpy_util.pack_numpy(grads, layouts[1]) + + got = gen_image_ops.resize_bilinear_grad( + grads=grads, + original_image=images, + align_corners=False, + half_pixel_centers=False, + name=None, + ) + self.assertDTensorEqual(expected_result, self.batch_layout_4d, got) + + def testResizeBilinearGradReplicated(self): + np.random.seed(123) + images = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]), + dtype=dtypes.float32, + ) + grads = constant_op.constant( + np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]), + dtype=dtypes.float32, + ) + expected_result = gen_image_ops.resize_bilinear_grad( + grads=grads, + original_image=images, + align_corners=False, + half_pixel_centers=False, + 
+        name=None,
+    )
+
+    images = numpy_util.pack_numpy(images, self.replicated_layout_4d)
+    grads = numpy_util.pack_numpy(grads, self.replicated_layout_4d)
+
+    got = gen_image_ops.resize_bilinear_grad(
+        grads=grads,
+        original_image=images,
+        align_corners=False,
+        half_pixel_centers=False,
+        name=None,
+    )
+    self.assertDTensorEqual(expected_result, self.replicated_layout_4d, got)
+
+  @parameterized.named_parameters(
+      test_util.product([('Replicated', 'replicated'), ('Sharded', 'batch')], [(
+          'SamePadding',
+          'SAME',
+      ), (
+          'ValidPadding',
+          'VALID',
+      )]))
+  def testMaxPool3DGrad(self, shard_spec, padding):
+    np.random.seed(123)
+    dep_window_size = 2
+    row_window_size = 3
+    col_window_size = 4
+    window_size = [1, dep_window_size, row_window_size, col_window_size, 1]
+    stride_size = [
+        1, dep_window_size - 1, row_window_size - 1, col_window_size - 1, 1
+    ]
+
+    num_deps = 3
+    num_rows = (row_window_size - 1) * 5 + 1
+    num_cols = (col_window_size - 1) * 7 + 1
+    x_in = np.random.normal(0.0, 1.0, 8 * num_deps * num_rows * num_cols *
+                            3).reshape([8, num_deps, num_rows, num_cols, 3])
+    inputs = constant_op.constant(x_in, dtype=dtypes.float32)
+
+    with backprop.GradientTape() as tape:
+      tape.watch([inputs])
+      expected_result = nn_ops.max_pool3d(
+          inputs, window_size, stride_size, padding
+      )
+    expected_grad = tape.gradient(expected_result, [inputs])
+    layout = (
+        self.batch_layout_5d
+        if shard_spec == 'batch'
+        else self.replicated_layout_5d
+    )
+
+    inputs = numpy_util.pack_numpy(inputs, layout)
+
+    with ops.device_v2(api.device_name()):
+      with backprop.GradientTape() as tape:
+        tape.watch([inputs])
+        dtensor_result = nn_ops.max_pool3d(
+            inputs, window_size, stride_size, padding
+        )
+      dtensor_grad = tape.gradient(dtensor_result, [inputs])
+
+    self.assertDTensorEqual(expected_grad[0], layout, dtensor_grad[0])
+
+  @parameterized.named_parameters(
+      test_util.product([('Replicated', 'replicated'), ('Sharded', 'batch')], [(
+          'SamePadding',
+          'SAME',
+      ), (
+          'ValidPadding',
+          'VALID',
+      )]))
+  def testMaxPool3DGradGrad(self, shard_spec, padding):
+    np.random.seed(123)
+    dep_window_size = 2
+    row_window_size = 3
+    col_window_size = 4
+    window_size = [1, dep_window_size, row_window_size, col_window_size, 1]
+    stride_size = [
+        1, dep_window_size - 1, row_window_size - 1, col_window_size - 1, 1
+    ]
+
+    num_deps = 3
+    num_rows = (row_window_size - 1) * 5 + 1
+    num_cols = (col_window_size - 1) * 7 + 1
+    x_in = np.random.normal(0.0, 1.0, 8 * num_deps * num_rows * num_cols *
+                            3).reshape([8, num_deps, num_rows, num_cols, 3])
+    inputs = constant_op.constant(x_in, dtype=dtypes.float32)
+
+    with backprop.GradientTape() as outer_tape:
+      with backprop.GradientTape() as inner_tape:
+        outer_tape.watch([inputs])
+        inner_tape.watch([inputs])
+        expected_result = nn_ops.max_pool3d(
+            inputs, window_size, stride_size, padding
+        )
+      expected_first_grad = inner_tape.gradient(expected_result, [inputs])
+    expected_second_grad = outer_tape.gradient(expected_first_grad, [inputs])
+
+    if shard_spec == 'batch':
+      layout = self.batch_layout_5d
+    else:
+      layout = self.replicated_layout_5d
+
+    inputs = numpy_util.pack_numpy(inputs, layout)
+
+    @polymorphic_function.function()
+    def compute_gradients(inputs):
+      with backprop.GradientTape() as outer_tape:
+        with backprop.GradientTape() as inner_tape:
+          outer_tape.watch([inputs])
+          inner_tape.watch([inputs])
+          dtensor_result = nn_ops.max_pool3d(
+              inputs, window_size, stride_size, padding
+          )
+        dtensor_first_grad = inner_tape.gradient(dtensor_result, [inputs])
+      dtensor_second_grad = outer_tape.gradient(dtensor_first_grad[0], [inputs])
+      return dtensor_first_grad, dtensor_second_grad
+
+    dtensor_first_grad, dtensor_second_grad = compute_gradients(inputs)
+
+    self.assertDTensorEqual(expected_first_grad[0], layout,
+                            dtensor_first_grad[0])
+    self.assertDTensorEqual(expected_second_grad[0], layout,
+                            dtensor_second_grad[0])
+
+  @parameterized.named_parameters(
+      test_util.product([('Replicated', 'replicated'), ('Sharded', 'batch')], [(
+          'SamePadding',
+          'SAME',
+      ), (
+          'ValidPadding',
+          'VALID',
+      )]))
+  def testMaxPoolGradGrad(self, shard_spec, padding):
+    np.random.seed(123)
+    row_window_size = 3
+    col_window_size = 4
+    window_size = [1, row_window_size, col_window_size, 1]
+    stride_size = [1, row_window_size - 1, col_window_size - 1, 1]
+
+    num_rows = (row_window_size - 1) * 5 + 1
+    num_cols = (col_window_size - 1) * 7 + 1
+    x_in = np.random.normal(0.0, 1.0, 8 * num_rows * num_cols * 3).reshape(
+        [8, num_rows, num_cols, 3])
+    inputs = constant_op.constant(x_in, dtype=dtypes.float32)
+
+    with backprop.GradientTape() as outer_tape:
+      with backprop.GradientTape() as inner_tape:
+        outer_tape.watch([inputs])
+        inner_tape.watch([inputs])
+        expected_result = nn_ops.max_pool_v2(
+            inputs, window_size, stride_size, padding
+        )
+      expected_first_grad = inner_tape.gradient(expected_result, [inputs])
+    expected_second_grad = outer_tape.gradient(expected_first_grad, [inputs])
+
+    if shard_spec == 'batch':
+      layout = self.batch_layout_4d
+    else:
+      layout = self.replicated_layout_4d
+    inputs = numpy_util.pack_numpy(inputs, layout)
+
+    @polymorphic_function.function()
+    def compute_gradients(inputs):
+      with backprop.GradientTape() as outer_tape:
+        with backprop.GradientTape() as inner_tape:
+          outer_tape.watch([inputs])
+          inner_tape.watch([inputs])
+          dtensor_result = nn_ops.max_pool_v2(
+              inputs, window_size, stride_size, padding
+          )
+        dtensor_first_grad = inner_tape.gradient(dtensor_result, [inputs])
+      dtensor_second_grad = outer_tape.gradient(dtensor_first_grad[0], [inputs])
+      return dtensor_first_grad, dtensor_second_grad
+
+    dtensor_first_grad, dtensor_second_grad = compute_gradients(inputs)
+
+    self.assertDTensorEqual(expected_first_grad[0], layout,
+                            dtensor_first_grad[0])
+    self.assertDTensorEqual(expected_second_grad[0], layout,
+                            dtensor_second_grad[0])
+
+  @parameterized.named_parameters(('Sharded', 'sharded'),
+                                  ('Replicated', 'replicated'))
+  def testResizeNearestNeighborGrad(self, shard_spec):
+    np.random.seed(123)
+    grads = constant_op.constant(
+        np.random.normal(0.0, 1.0, 8 * 9 * 9).reshape([8, 9, 9, 1]),
+        dtype=dtypes.float32,
+    )
+    expected_result = gen_image_ops.resize_nearest_neighbor_grad(
+        grads=grads,
+        size=[3, 3],
+        align_corners=False,
+        half_pixel_centers=False,
+        name=None,
+    )
+
+    if shard_spec == 'sharded':
+      layout = self.batch_layout_4d
+    else:
+      layout = self.replicated_layout_4d
+
+    grads = numpy_util.pack_numpy(grads, layout)
+
+    got = gen_image_ops.resize_nearest_neighbor_grad(
+        grads=grads,
+        size=[3, 3],
+        align_corners=False,
+        half_pixel_centers=False,
+        name=None,
+    )
+
+    self.assertDTensorEqual(expected_result, layout, got)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/dtensor/python/tests/cache_test.py b/tensorflow/dtensor/python/tests/cache_test.py
new file mode 100644
index 00000000000000..d56dacb8c8f605
--- /dev/null
+++ b/tensorflow/dtensor/python/tests/cache_test.py
@@ -0,0 +1,330 @@
+# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests DTensor device cache for compiled function computation.""" + +import gc +import numpy as np + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import d_variable +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import combinations +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_stateless_random_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + +# Convenient constants to use for tests. +_BATCH_DIM = "batch" +_MESH_DIM_X = "x" + +# Shorter notation. +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh + + +def diff_dicts(dict1, dict2): + keys = set(dict1.keys()) | set(dict2.keys()) + return {key: dict1.get(key, 0) - dict2.get(key, 0) for key in keys} + + +class DTensorDeviceCacheTest(test_util.DTensorBaseTest): + + def setUp(self): + super(DTensorDeviceCacheTest, self).setUp() + device_ids = test_util.create_device_ids_array((2,)) + local_device_ids = np.ravel(device_ids).tolist() + mesh_dict = { + device: Mesh( + [_BATCH_DIM], + device_ids, + local_device_ids, + test_util.create_device_list((2,), device), + ) + for device in ("CPU", "GPU", "TPU") + } + self.mesh = self.configTestMesh(mesh_dict) + + def testBasic(self): + + @polymorphic_function.function + def func0(a): + return a + 1 + + @polymorphic_function.function + def func1(a): + return a + 2 + + c0 = api.copy_to_mesh( + constant_op.constant(1.0), Layout.replicated(self.mesh, rank=0) + ) + c1 = api.copy_to_mesh( + constant_op.constant([2.0, 3.0]), Layout.replicated(self.mesh, rank=1) + ) + c2 = api.copy_to_mesh( + constant_op.constant([4.0]), Layout.replicated(self.mesh, rank=1) + ) + c3 = api.copy_to_mesh( + constant_op.constant(1, dtype=dtypes.int32), + Layout.replicated(self.mesh, rank=0), + ) + + # c0 and c1 have different layouts. c1 and c2 have different shapes. + # c0 and c3 have different dtypes. + self.assertAllEqual(func0(c0), 2.0) + self.assertAllEqual(func0(c1), [3.0, 4.0]) + self.assertAllEqual(func0(c2), [5.0]) + self.assertAllEqual(func0(c3), 2) + + # func0 and func1 have different names. 
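+    # A function with a different name should get its own cache entry rather
+    # than reusing func0's compiled computation.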
+ self.assertAllEqual(func1(c0), 3.0) + + def testFunctionInputConstantFoldingCacheHits(self): + + @polymorphic_function.function + def add(a, b): + return a + b + + c0 = api.copy_to_mesh( + constant_op.constant(17.0), Layout.replicated(self.mesh, rank=0) + ) + c1 = api.copy_to_mesh( + constant_op.constant(21.0), Layout.replicated(self.mesh, rank=0) + ) + + stats1 = api._dtensor_device()._get_stats() + self.assertAllEqual(add(c0, c1), 38.0) + self.assertAllEqual(add(c0, c1), 38.0) + + # First call should miss and second should hit. + stats2 = api._dtensor_device()._get_stats() + diff = {key: stats2[key] - stats1[key] for key in stats1.keys()} + self.assertEqual(diff["function_manager.miss"], 1) + self.assertEqual(diff["function_manager.hit"], 1) + + def testFunctionInputConstantFoldingCacheMiss(self): + + @polymorphic_function.function + def add(a, b): + return a + b + + c0 = api.copy_to_mesh( + constant_op.constant(17.0), Layout.replicated(self.mesh, rank=0) + ) + c1 = api.copy_to_mesh( + constant_op.constant(21.0), Layout.replicated(self.mesh, rank=0) + ) + c2 = api.copy_to_mesh( + constant_op.constant(0.0), Layout.replicated(self.mesh, rank=0) + ) + + stats1 = api._dtensor_device()._get_stats() + # First call should log a cache miss. + self.assertAllEqual(add(c0, c1), 38.0) + + # Second call should also log a cache miss since second constant changed. + self.assertAllEqual(add(c0, c2), 17.0) + + # Third call should not log a cache miss since the same input as the prev. + self.assertAllEqual(add(c0, c2), 17.0) + + # Fourth call should log a cache miss since first input changed. + self.assertAllEqual(add(c1, c2), 21.0) + + stats2 = api._dtensor_device()._get_stats() + diff = {key: stats2[key] - stats1[key] for key in stats1.keys()} + self.assertEqual(diff["function_manager.miss"], 3) + self.assertEqual(diff["function_manager.hit"], 1) + + def testCacheWithRNG(self): + with api._dtensor_device()._default_layout( + Layout.replicated(self.mesh, rank=1)): + v0 = gen_stateless_random_ops.stateless_random_normal( + shape=[1], seed=[1, 2] + ) + + with api._dtensor_device()._default_layout( + Layout.replicated(self.mesh, rank=1)): + v1 = gen_stateless_random_ops.stateless_random_normal( + shape=[1], seed=[1, 2] + ) + v2 = gen_stateless_random_ops.stateless_random_normal( + shape=[2], seed=[1, 2] + ) + v3 = gen_stateless_random_ops.stateless_random_normal( + shape=[1], seed=[3, 4] + ) + + # v0 and v1 have same layouts. + self.assertAllEqual(v0, v1) + api.check_layout(v0, Layout.replicated(self.mesh, rank=1)) + api.check_layout(v1, Layout.replicated(self.mesh, rank=1)) + # v1 and v2 have different shapes. + self.assertNotEqual(v1.shape, v2.shape) + # v1 and v3 have different seeds. 
+ self.assertNotEqual(v1.numpy(), v3.numpy()) + + def testCacheWithVariable(self): + c0 = api.copy_to_mesh( + constant_op.constant(1.0), Layout.replicated(self.mesh, rank=0) + ) + c1 = api.copy_to_mesh( + constant_op.constant([2.0, 3.0]), Layout.replicated(self.mesh, rank=1) + ) + a = constant_op.constant([4.0]) + b = constant_op.constant([5.0]) + c2 = api.pack( + [a, b], layout=Layout.batch_sharded(self.mesh, _BATCH_DIM, rank=1) + ) + + v0 = d_variable.DVariable(c0) + v1 = d_variable.DVariable(c1) + v2 = d_variable.DVariable(c2) + + self.assertAllEqual(v0.read_value(), 1.0) + self.assertAllEqual(v1.read_value(), [2.0, 3.0]) + unpacked_tensor = api.unpack(v2.read_value()) + self.assertAllClose([4.0], unpacked_tensor[0]) + self.assertAllClose([5.0], unpacked_tensor[1]) + + @combinations.generate( + combinations.combine(size=[16, 40], same_value=[True, False]) + ) + def testManyFunctions(self, size, same_value): + r = range(100) + + values = [np.reshape(r[i : i + size], (4, size // 4)) for i in range(10)] + c_layout = Layout.replicated(self.mesh, rank=2) + values = [constant_op.constant(v, dtype=dtypes.float32) for v in values] + c0 = [api.copy_to_mesh(v, c_layout) for v in values] + + c0 = [c0[0 if same_value else i] for i in range(10)] + e0 = [values[0 if same_value else i] for i in range(10)] + stats1 = api._dtensor_device()._get_stats() + + for i in range(10): + # Use a special to ensure no conflicts with otherwise used names. + @polymorphic_function.function + def fn_31415926(c): + return math_ops.reduce_sum(c) + + self.assertAllEqual(fn_31415926(c0[i]).numpy(), np.sum(e0[i])) + + del fn_31415926 + gc.collect() + + stats2 = api._dtensor_device()._get_stats() + diff = diff_dicts(stats2, stats1) + self.assertEqual(diff["function_manager.size"], 0) + self.assertEqual(diff["kernel_cache.size"], 0) + self.assertEqual(diff["device_cache.size"], 0) + + @combinations.generate( + combinations.combine(size=[16, 40], same_value=[True, False]) + ) + def testManyEagerOps(self, size, same_value): + if self.mesh.device_type() != "TPU": + # For the CPU/GPU mesh, we have a shortcut that doesn't go through the + # MLIR, but run the eager op locally and broadcast to all the devices. + expected_cache_diff = 0 + expected_kernel_cache = 0 + expected_device_cache = 0 + expected_eager_pure_hit = 10 + else: + # TODO(b/287529295): Remove this branch after the TPU issue is fixed. + expected_device_cache = 0 + expected_eager_pure_hit = 0 + if same_value: + expected_cache_diff = 1 + expected_kernel_cache = 2 + else: + if size >= 20: + expected_cache_diff = 1 + expected_kernel_cache = 2 + else: + expected_cache_diff = 2 + expected_kernel_cache = 4 + + r = range(100) + c_layout = Layout.replicated(self.mesh, rank=2) + values = [np.reshape(r[i : i + size], (4, size // 4)) for i in range(10)] + values = [constant_op.constant(v, dtype=dtypes.float32) for v in values] + c0 = [api.copy_to_mesh(v, c_layout) for v in values] + + c0 = [c0[0 if same_value else i] for i in range(10)] + e0 = [values[0 if same_value else i] for i in range(10)] + + stats1 = api._dtensor_device()._get_stats() + + for i in range(10): + self.assertAllEqual(array_ops.identity(c0[i]).numpy(), e0[i]) + + gc.collect() + + stats2 = api._dtensor_device()._get_stats() + diff = diff_dicts(stats2, stats1) + + if same_value: + self.assertEqual(diff["function_manager.size"], expected_cache_diff) + self.assertEqual( + diff["eager_pure_optimization.hit"], expected_eager_pure_hit + ) + # TFRT doesn't use eager cache. 
+ if not test_util.is_tfrt_enabled(): + self.assertEqual(diff["kernel_cache.size"], expected_kernel_cache) + self.assertEqual(diff["device_cache.size"], expected_device_cache) + else: + # FIXME(feyu): Update these when the leaks are fixed. + if size >= 20: + self.assertEqual(diff["function_manager.size"], expected_cache_diff) + self.assertEqual( + diff["eager_pure_optimization.hit"], expected_eager_pure_hit + ) + # TFRT doesn't use eager cache. + if not test_util.is_tfrt_enabled(): + self.assertEqual(diff["kernel_cache.size"], expected_kernel_cache) + self.assertEqual(diff["device_cache.size"], expected_device_cache) + else: + self.assertEqual(diff["function_manager.size"], expected_cache_diff) + self.assertEqual( + diff["eager_pure_optimization.hit"], expected_eager_pure_hit + ) + # TFRT doesn't use eager cache. + if not test_util.is_tfrt_enabled(): + self.assertEqual(diff["kernel_cache.size"], expected_kernel_cache) + self.assertEqual(diff["device_cache.size"], expected_device_cache) + + def testManyEagerOpsVaryInput(self): + c_layout = Layout.replicated(self.mesh, rank=10) + + c0 = constant_op.constant( + [[[[[[[[[[0, 1, 2, 3], [4, 5, 6, 7]]]]]]]]]], dtype=dtypes.float32 + ) + e0 = c0.numpy() + c0 = api.copy_to_mesh(c0, c_layout) + + for ax in range(10): + self.assertAllEqual( + math_ops.reduce_sum(c0, axis=ax).numpy(), np.sum(e0, axis=ax) + ) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/dtensor/python/tests/conv_test.py b/tensorflow/dtensor/python/tests/conv_test.py new file mode 100644 index 00000000000000..25cab09e1096ac --- /dev/null +++ b/tensorflow/dtensor/python/tests/conv_test.py @@ -0,0 +1,350 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for executing ops needed to implement image model.""" + +from absl.testing import parameterized +import numpy as np + +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.eager import backprop +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import special_math_ops +from tensorflow.python.platform import test + + +UNSHARDED = layout_lib.UNSHARDED +Mesh = layout_lib.Mesh +Layout = layout_lib.Layout + +BATCH_DIM = 'batch' +DEPTH_DIM = 'depth' +HEIGHT_DIM = 'height' +WIDTH_DIM = 'width' +BATCH_SIZE = 4 +DEPTH = 8 +HEIGHT = 12 +WIDTH = 12 +CHANNEL_IN = 1 +CHANNEL_OUT = 3 + + +class ConvOpTest(test_util.DTensorBaseTest): + + def setUp(self): + super().setUp() + + global_ids = test_util.create_device_ids_array((2, 2, 2)) + local_ids = np.ravel(global_ids).tolist() + mesh_dict = {} + for device in ('CPU', 'GPU', 'TPU'): + mesh_dict[device] = Mesh( + [BATCH_DIM, HEIGHT_DIM, WIDTH_DIM], + global_ids, + local_ids, + test_util.create_device_list((2, 2, 2), device), + ) + + self.mesh = self.configTestMesh(mesh_dict) + + self.replicated_2d = Layout.replicated(self.mesh, 2) + self.batch_sharded_2d = Layout.batch_sharded(self.mesh, BATCH_DIM, 2) + + @parameterized.named_parameters( + test_util.product( + *[ + [ + ( + 'Conv2D', + nn_ops.conv2d_v2, + (BATCH_SIZE, HEIGHT, WIDTH, CHANNEL_IN), + (2, 2, CHANNEL_IN, CHANNEL_OUT), + 'bhwc,xy->by', + [1, 2, 1, 1], + ), + ( + 'Conv3D', + nn_ops.conv3d_v2, + (BATCH_SIZE, DEPTH, HEIGHT, WIDTH, CHANNEL_IN), + (2, 2, 2, CHANNEL_IN, CHANNEL_OUT), + 'bdhwc,xy->by', + [1, 1, 2, 1, 1], + ), + ], + [ + ('Eager', True), + ('Graph', False), + ], + [ + ('ReplicatedInput', 'replicated'), + ('BatchShardedInput', 'batch_sharded'), + ], + [ + ('ValidPadding', 'VALID'), + ('SamePadding', 'SAME'), + ], + ] + ) + ) + def testConvFollowedByEinsum(self, conv_op, input_size, kernel_size, + einsum_eq, strides, eager_mode, input_sharding, + padding): + x_in = constant_op.constant( + np.random.random(size=input_size), dtype=dtypes.float32 + ) + kernel_in = constant_op.constant( + np.random.random(size=kernel_size), dtype=dtypes.float32 + ) + weight = constant_op.constant( + np.random.random(size=(2, 2)), dtype=dtypes.float32 + ) + + def conv_fn(inputs, img_kernel, layer_weights): + output = conv_op(inputs, img_kernel, strides=strides, padding=padding) + output = special_math_ops.einsum(einsum_eq, output, layer_weights) + return output + + if not eager_mode: + conv_fn = polymorphic_function.function(conv_fn) + + golden_result = conv_fn(x_in, kernel_in, weight) + + if input_sharding == 'replicated': + input_layout = Layout.replicated(self.mesh, len(input_size)) + output_layout = self.replicated_2d + elif input_sharding == 'batch_sharded': + input_layout = Layout.batch_sharded(self.mesh, BATCH_DIM, len(input_size)) + output_layout = self.batch_sharded_2d + + kernel_layout = Layout.replicated(self.mesh, len(kernel_size)) + + d_x_in = numpy_util.pack_numpy(x_in, input_layout) + d_kernel_in = numpy_util.pack_numpy(kernel_in, kernel_layout) + d_weight = numpy_util.pack_numpy(weight, self.replicated_2d) + d_result = conv_fn(d_x_in, d_kernel_in, d_weight) + + 
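+    # The sharded DTensor execution is expected to reproduce the single-device
+    # golden result under the chosen output layout.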
self.assertDTensorEqual(golden_result, output_layout, d_result) + + @parameterized.named_parameters( + test_util.product( + *[ + [ + ( + 'Conv2D', + nn_ops.conv2d_v2, + (BATCH_SIZE, HEIGHT, WIDTH, CHANNEL_IN), + (2, 2, CHANNEL_IN, CHANNEL_OUT), + 'bhwc,xy->by', + [1, 1, 1, 1], + ), + ( + 'Conv3D', + nn_ops.conv3d_v2, + (BATCH_SIZE, DEPTH, HEIGHT, WIDTH, CHANNEL_IN), + (2, 2, 2, CHANNEL_IN, CHANNEL_OUT), + 'bdhwc,xy->by', + [1, 1, 1, 1, 1], + ), + ], + [ + ('ReplicatedInput', 'replicated'), + ('BatchShardedInput', 'batch_sharded'), + ], + [ + ('ValidPadding', 'VALID'), + ('SamePadding', 'SAME'), + ], + ] + ) + ) + def testConvFollowedByEinsumWithGradient(self, conv_op, input_size, + kernel_size, einsum_eq, strides, + input_sharding, padding): + x_in = constant_op.constant( + np.random.random(size=input_size), dtype=dtypes.float32 + ) + kernel_in = constant_op.constant( + np.random.random(size=kernel_size), dtype=dtypes.float32 + ) + weight = constant_op.constant( + np.random.random(size=(2, 2)), dtype=dtypes.float32 + ) + + @polymorphic_function.function + def conv_fn(inputs, img_kernel, layer_weights): + with backprop.GradientTape() as tape: + tape.watch([inputs, img_kernel, layer_weights]) + output = conv_op(inputs, img_kernel, strides=strides, padding=padding) + output = special_math_ops.einsum(einsum_eq, output, layer_weights) + + inputs_grad, kernel_grad, weight_grad = tape.gradient( + output, [inputs, img_kernel, layer_weights]) + return output, inputs_grad, kernel_grad, weight_grad + + result, inputs_grad, kernel_grad, weight_grad = conv_fn( + x_in, kernel_in, weight) + + if input_sharding == 'replicated': + input_layout = Layout.replicated(self.mesh, len(input_size)) + output_layout = self.replicated_2d + elif input_sharding == 'batch_sharded': + input_layout = Layout.batch_sharded(self.mesh, BATCH_DIM, len(input_size)) + output_layout = self.batch_sharded_2d + + kernel_layout = Layout.replicated(self.mesh, len(kernel_size)) + + d_x_in = numpy_util.pack_numpy(x_in, input_layout) + d_kernel_in = numpy_util.pack_numpy(kernel_in, kernel_layout) + d_weight = numpy_util.pack_numpy(weight, self.replicated_2d) + d_result, d_inputs_grad, d_kernel_grad, d_weight_grad = conv_fn( + d_x_in, d_kernel_in, d_weight) + + self.assertDTensorEqual(result, output_layout, d_result) + # TODO(b/208700444): layout of input grads should match layout of input. + self.assertDTensorEqual( + inputs_grad, + Layout.replicated(self.mesh, len(input_size)), + d_inputs_grad, + ) + self.assertDTensorEqual(kernel_grad, kernel_layout, d_kernel_grad) + self.assertDTensorEqual(weight_grad, self.replicated_2d, d_weight_grad) + + +SPATIALLY_PARTITIONED_CONV_TEST_CASES = [ + [ + ('Case1', (BATCH_SIZE, 8, 16, CHANNEL_IN), (3, 5, CHANNEL_IN, + CHANNEL_OUT)), + ('Case2', (BATCH_SIZE, 8, 128, CHANNEL_IN), (3, 9, CHANNEL_IN, + CHANNEL_OUT)), + ], + [ + ('ValidPadding', 'VALID'), + ('SamePadding', 'SAME'), + ], + [ + ('Batch_1d_2x4', [BATCH_DIM, UNSHARDED, WIDTH_DIM, UNSHARDED], (2, 4)), + ('2d_2x4', [UNSHARDED, HEIGHT_DIM, WIDTH_DIM, UNSHARDED], (2, 4)), + ('Batch_2d_2x2x2', [BATCH_DIM, HEIGHT_DIM, WIDTH_DIM, + UNSHARDED], (2, 2, 2)), + ], +] + + +class SpatiallyPartitionedConvOpTest(test_util.DTensorBaseTest): + + def setUp(self): + super().setUp() + + # TODO(b/261485237): Enable CPU testing once CollectivePermute is supported + # on CPU's. 
+ if not test_util.is_tpu_present(): + self.skipTest('This test only runs on TPUs.') + + def _create_mesh(self, mesh_dims, topology): + global_ids = test_util.create_device_ids_array(topology) + local_ids = np.ravel(global_ids).tolist() + mesh_dict = {} + for device in ('CPU', 'GPU', 'TPU'): + mesh_dict[device] = Mesh( + mesh_dims, + global_ids, + local_ids, + test_util.create_device_list(topology, device), + ) + + return self.configTestMesh(mesh_dict) + + @parameterized.named_parameters( + test_util.product(*SPATIALLY_PARTITIONED_CONV_TEST_CASES)) + def testConv(self, input_shape, kernel_shape, padding, sharding_specs, + topology): + mesh_dims = [spec for spec in sharding_specs if spec != UNSHARDED] + mesh = self._create_mesh(mesh_dims, topology) + + x_in = constant_op.constant( + np.random.random(size=input_shape), dtype=dtypes.float32 + ) + kernel_in = constant_op.constant( + np.random.random(size=kernel_shape), dtype=dtypes.float32 + ) + + expected_output = nn_ops.conv2d_v2( + x_in, kernel_in, strides=[1, 1, 1, 1], padding=padding + ) + + input_layout = Layout(sharding_specs, mesh) + kernel_layout = Layout.replicated(mesh, 4) + + d_x_in = numpy_util.pack_numpy(x_in, input_layout) + d_kernel_in = numpy_util.pack_numpy(kernel_in, kernel_layout) + d_output = nn_ops.conv2d_v2( + d_x_in, d_kernel_in, strides=[1, 1, 1, 1], padding=padding + ) + + self.assertDTensorEqual(expected_output, input_layout, d_output) + + @parameterized.named_parameters( + test_util.product(*SPATIALLY_PARTITIONED_CONV_TEST_CASES)) + def testConvWithGradient(self, input_shape, kernel_shape, padding, + sharding_specs, topology): + # TODO(b/208700444): add support for SPMD expansion of spatially partitioned + # conv backprop. + self.skipTest( + 'b/208700444: Spatially partitioned conv backprop not implemented.') + + mesh_dims = [spec for spec in sharding_specs if spec != UNSHARDED] + mesh = self._create_mesh(mesh_dims, topology) + + x_in = constant_op.constant( + np.random.random(size=input_shape), dtype=dtypes.float32 + ) + kernel_in = constant_op.constant( + np.random.random(size=kernel_shape), dtype=dtypes.float32 + ) + + @polymorphic_function.function + def conv_fn(inputs, img_kernel, padding): + with backprop.GradientTape() as tape: + tape.watch([inputs, img_kernel]) + output = nn_ops.conv2d_v2( + inputs, img_kernel, strides=[1, 1, 1, 1], padding=padding + ) + inputs_grad, kernel_grad = tape.gradient(output, [inputs, img_kernel]) + return output, inputs_grad, kernel_grad + + expected_output, expected_inputs_grad, expected_kernel_grad = conv_fn( + x_in, kernel_in, padding) + + input_layout = Layout(sharding_specs, mesh) + kernel_layout = Layout.replicated(mesh, 4) + + d_x_in = numpy_util.pack_numpy(x_in, input_layout) + d_kernel_in = numpy_util.pack_numpy(kernel_in, kernel_layout) + + d_output, d_inputs_grad, d_kernel_grad = conv_fn(d_x_in, d_kernel_in, + padding) + + self.assertDTensorEqual(expected_output, input_layout, d_output) + self.assertDTensorEqual(expected_inputs_grad, input_layout, d_inputs_grad) + self.assertDTensorEqual(expected_kernel_grad, kernel_layout, d_kernel_grad) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tests/mnist_test.py b/tensorflow/dtensor/python/tests/mnist_test.py new file mode 100644 index 00000000000000..5fd08f18414ef0 --- /dev/null +++ b/tensorflow/dtensor/python/tests/mnist_test.py @@ -0,0 +1,197 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""DTensor MNIST test.""" + +from absl.testing import parameterized + +import numpy as np + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import d_variable +from tensorflow.dtensor.python import input_util +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import backprop +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import array_ops_stack +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import stateless_random_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +_BATCH_DIM = 'batch' +_DEVICE_IDS = test_util.create_device_ids_array((2,)) +_ONE_D_MESH = layout_lib.Mesh( + [_BATCH_DIM], + _DEVICE_IDS, + np.ravel(_DEVICE_IDS).tolist(), + test_util.create_device_list((2,), 'CPU'), +) +_ONE_D_TPU_MESH = layout_lib.Mesh( + [_BATCH_DIM], + _DEVICE_IDS, + np.ravel(_DEVICE_IDS).tolist(), + test_util.create_device_list((2,), 'TPU'), +) +_BATCH_SIZE = 1024 +_STEPS = 5 +_LR = 1e-3 +_ATOL = 1 # absolute error becomes large as gradients approach zero. 
+_RTOL = 1e-3 +Layout = layout_lib.Layout + + +def mnist_fake_dataset(): + imgs = [] + labels = [] + for i in range(_STEPS * _BATCH_SIZE): + img = stateless_random_ops.stateless_random_uniform( + shape=(28, 28, 1), + seed=[1, i], + minval=0, + maxval=256, + dtype=dtypes.float32, + ) + imgs.append(img) + label = stateless_random_ops.stateless_random_uniform( + shape=(1,), seed=[2, i], minval=0, maxval=10, dtype=dtypes.int64 + ) + labels.append(label) + + return dataset_ops.DatasetV2.from_tensor_slices( + (array_ops_stack.stack(imgs), array_ops_stack.stack(labels)) + ) + + +def _run_step(inputs, w, b, k): + with backprop.GradientTape() as g: + g.watch([w, b]) + logits = nn_ops.conv2d_v2(inputs, k, strides=[1, 1, 1, 1], padding='SAME') + logits = array_ops.reshape(logits, [logits.shape[0], -1]) + logits = math_ops.matmul(logits, w) + logits = logits + b + loss = math_ops.reduce_sum(logits, axis=[0, 1]) + gw, gb = g.gradient(loss, [w, b]) + for v, v_grad in zip([w, b], [gw, gb]): + v.assign_sub(_LR * v_grad) + return gw, gb, loss + + +class DTensorMNISTTest(test_util.DTensorBaseTest): + + def setUp(self): + super(DTensorMNISTTest, self).setUp() + + global_ids = test_util.create_device_ids_array((2,)) + local_ids = np.ravel(global_ids).tolist() + mesh_dict = { + device: layout_lib.Mesh( + [_BATCH_DIM], + global_ids, + local_ids, + test_util.create_device_list((2,), device), + ) + for device in ['TPU', 'GPU', 'CPU'] + } + self.mesh = self.configTestMesh(mesh_dict) + + def init_var(self, mesh): + # Initialize TF randon normal variables(without using DTensor). + w_initializer = stateless_random_ops.stateless_random_normal( + shape=[28 * 28, 10], seed=[0, 1] + ) + b_initializer = stateless_random_ops.stateless_random_normal( + shape=[10], seed=[1, 2] + ) + # A filter with 3x3 shape, 1 input channel and 1 output channel. + k_initializer = stateless_random_ops.stateless_random_normal( + [3, 3, 1, 1], seed=[2, 3] + ) + + n_w = variables.Variable(w_initializer) + n_b = variables.Variable(b_initializer) + n_k = variables.Variable(k_initializer) + + # Initialize DTensor variables. + w_initializer_on_mesh = api.copy_to_mesh( + w_initializer, Layout.replicated(mesh, 2) + ) + b_initializer_on_mesh = api.copy_to_mesh( + b_initializer, Layout.replicated(mesh, rank=1) + ) + k_initializer_on_mesh = api.copy_to_mesh( + k_initializer, Layout.replicated(mesh, rank=4) + ) + + w = d_variable.DVariable(w_initializer_on_mesh) + b = d_variable.DVariable(b_initializer_on_mesh) + k = d_variable.DVariable(k_initializer_on_mesh) + + return (n_w, n_b, n_k), (w, b, k) + + @parameterized.named_parameters(('Eager', False), ('Function', True)) + def testMnist(self, on_function): + mnist_dataset = mnist_fake_dataset() + + (n_w, n_b, n_k), (w, b, k) = self.init_var(self.mesh) + + n_dataset = mnist_dataset.batch(_BATCH_SIZE, drop_remainder=True) + n_iter = iter(n_dataset) + + input_layout = Layout.batch_sharded(self.mesh, _BATCH_DIM, rank=4) + label_layout = Layout.batch_sharded(self.mesh, _BATCH_DIM, rank=2) + dtensor_dataset = input_util.DTensorDataset( + dataset=mnist_dataset, + global_batch_size=_BATCH_SIZE, + mesh=self.mesh, + layouts=(input_layout, label_layout), + batch_dim=_BATCH_DIM, + ) + dtensor_iter = iter(dtensor_dataset) + + step_fn = ( + polymorphic_function.function(_run_step) if on_function else _run_step + ) + + # Training loop. + for _ in range(_STEPS): + # Normal run without DTensor. 
+ n_input, _ = next(n_iter) + g_nw, g_nb, n_loss = step_fn(n_input, n_w, n_b, n_k) + + # DTensor Run + dtensor_input, _ = next(dtensor_iter) + with ops.device_v2(api.device_name()): + gw, gb, loss = step_fn(dtensor_input, w, b, k) + + loss_unpack = api.unpack(loss) + self.assertAllEqual(loss_unpack[0], loss_unpack[1]) + + self.assertAllClose(n_loss, loss, atol=_ATOL, rtol=_RTOL) + self.assertAllClose(g_nw, gw, atol=_ATOL, rtol=_RTOL) + self.assertAllClose(g_nb, gb, atol=_ATOL, rtol=_RTOL) + self.assertAllClose(n_w, w, atol=_ATOL, rtol=_RTOL) + self.assertAllClose(n_b, b, atol=_ATOL, rtol=_RTOL) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tests/multi_client_input_util_test.py b/tensorflow/dtensor/python/tests/multi_client_input_util_test.py new file mode 100644 index 00000000000000..3538cb4e09b4e6 --- /dev/null +++ b/tensorflow/dtensor/python/tests/multi_client_input_util_test.py @@ -0,0 +1,548 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Multi-client tests for input_util.""" + +import os +from typing import Any, List, Mapping, Optional, Tuple + +from absl import logging +from absl.testing import parameterized +import numpy as np + +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from tensorflow.dtensor.python import accelerator_util +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import config +from tensorflow.dtensor.python import input_util +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import mesh_util +from tensorflow.dtensor.python.tests import multi_client_test_util +from tensorflow.dtensor.python.tests import test_backend_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.data.experimental.service import server_lib +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import readers +from tensorflow.python.eager import context +from tensorflow.python.framework import config as tf_config +from tensorflow.python.framework import device_spec +from tensorflow.python.framework import dtypes +from tensorflow.python.lib.io import tf_record +from tensorflow.python.ops import array_ops_stack +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import gen_parsing_ops +from tensorflow.python.ops import io_ops +from tensorflow.python.ops import parsing_config +from tensorflow.python.ops import parsing_ops +from tensorflow.python.ops import stateless_random_ops +from tensorflow.python.platform import test + + +mp_context = test_backend_util.get_mp_context() + +# Multi-client test constants. +JOB_NAME = 'worker' +TF_DATA_SERVICE_JOB_NAME = 'dtensor_tf_data' +NUM_CLIENTS = 4 +NUM_DEVICES_PER_CLIENT = 4 + +# Mesh constants. 
+MESH_DIM_BATCH = 'batch' +MESH_DIM_HEIGHT = 'height' +MESH_DIM_WIDTH = 'width' + +# Data constants. +IMG_HEIGHT = 8 +IMG_WIDTH = 8 +IMG_CHANNELS = 3 + +UNSHARDED = layout_lib.UNSHARDED +Mesh = layout_lib.Mesh +Layout = layout_lib.Layout + + +def redirect_output(file_name): + # Redirect stderr/stdout to undeclared outputs on sponge. + artifact_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', '') + if artifact_dir: + with open(os.path.join(artifact_dir, file_name), 'wb') as fp: + os.dup2(fp.fileno(), 1) + os.dup2(fp.fileno(), 2) + + +def create_dispatcher(test_name, worker_addresses, port, pipe=None): + dispatcher = server_lib.DispatchServer( + config=server_lib.DispatcherConfig( + port=port, protocol='grpc', worker_addresses=worker_addresses + ) + ) + dispatcher.start() + if pipe is None: + # Dispatcher is not within subprocess, so do not block. + return dispatcher, dispatcher._address + else: + redirect_output(f'test-{test_name}-dispatcher.log') + pipe.send(dispatcher._address) + signal = pipe.recv() # blocks until a 'stop' signal is received + if signal == 'stop': + dispatcher._stop() + pipe.send('stopped') + else: + raise ValueError('Got unknown signal %s' % signal) + + +def create_worker(test_name, dispatcher_address, port=None, pipe=None): + worker = server_lib.WorkerServer( + config=server_lib.WorkerConfig( + port=port, dispatcher_address=dispatcher_address, protocol='grpc' + ) + ) + worker.start() + if pipe is None: + # Worker is not within subprocess, so do not block. + return worker, worker._address + else: + redirect_output(f'test-{test_name}-worker.log') + pipe.send(worker._address) + signal = pipe.recv() # blocks until a 'stop' signal is received + if signal == 'stop': + worker._stop() + pipe.send('stopped') + else: + raise ValueError('Got unknown signal %s' % signal) + + +class TFDataServiceCluster: + """tf.data service cluster with dispatcher and workers as subprocesses. + + To run the cluster in co-located mode, set `num_workers` to 0 and create the + tf.data service workers manually in each client process. 
+  """
+
+  def __init__(self,
+               test_name,
+               num_workers,
+               worker_ports=None,
+               worker_addresses=None):
+    self._test_name = test_name
+    self._num_workers = num_workers
+    self._start_dispatcher(worker_addresses)
+    self._start_workers(worker_ports)
+
+  def _start_dispatcher(self, worker_addresses, port=0):
+    self._pipe_to_dispatcher, dispatcher_pipe = mp_context.Pipe(True)
+    logging.info(
+        'Starting remote dispatcher on port %d with worker addresses: %s', port,
+        worker_addresses)
+    self._dispatcher_process = mp_context.Process(
+        target=create_dispatcher,
+        args=(self._test_name, worker_addresses, port, dispatcher_pipe),
+    )
+    self._dispatcher_process.start()
+    self._dispatcher_address = self._pipe_to_dispatcher.recv()
+
+  def dispatcher_address(self):
+    return self._dispatcher_address
+
+  def _start_workers(self, worker_ports=None):
+    self._workers = []
+    self._worker_addresses = []
+    self._worker_pipes = []
+    for idx in range(self._num_workers):
+      port = worker_ports[idx] if worker_ports else None
+      self._start_worker(port)
+
+  def _start_worker(self, port=None):
+    pipe_to_worker, worker_pipe = mp_context.Pipe(True)
+    logging.info(
+        'Starting remote worker on port %d with dispatcher address: %s', port,
+        self._dispatcher_address)
+    worker_process = mp_context.Process(
+        target=create_worker,
+        args=(self._test_name, self._dispatcher_address, port, worker_pipe),
+    )
+    worker_process.start()
+    worker_address = pipe_to_worker.recv()
+    self._workers.append(worker_process)
+    self._worker_addresses.append(worker_address)
+    self._worker_pipes.append(pipe_to_worker)
+
+  def worker_addresses(self):
+    return self._worker_addresses
+
+  def stop(self):
+    # Segfault logs may still be printed because clean exit of child processes
+    # is not always possible. This will not affect the outcome of the test.
+    logging.info('Will try to stop TFDataServiceCluster!')
+
+    for idx in range(self._num_workers):
+      address = self._worker_addresses[idx]
+      pipe_to_worker = self._worker_pipes[idx]
+      logging.info('Stopping worker %s...', address)
+      pipe_to_worker.send('stop')
+      if pipe_to_worker.poll(2):
+        if pipe_to_worker.recv() == 'stopped':
+          logging.info('Successfully stopped worker %s', address)
+      self._workers[idx].terminate()
+
+    logging.info('Stopping dispatcher...')
+    self._pipe_to_dispatcher.send('stop')
+    if self._pipe_to_dispatcher.poll(2):
+      if self._pipe_to_dispatcher.recv() == 'stopped':
+        logging.info('Successfully stopped dispatcher')
+    self._dispatcher_process.terminate()
+
+
+def setup_local_devices(num_devices):
+  physical_cpus = tf_config.list_physical_devices('CPU')
+  tf_config.set_logical_device_configuration(
+      physical_cpus[0],
+      [context.LogicalDeviceConfiguration() for _ in range(num_devices)],
+  )
+
+
+def setup_client(client_id: int, test_name: str, env: Mapping[str, str],
+                 num_local_devices: int):
+  """Set up a DTensor client for use in multi-client tests.
+
+  Args:
+    client_id: the index of the client.
+    test_name: the name of the test under which this client is running, used to
+      identify the log file artifact containing the test output.
+    env: a dictionary of environment variables to update.
+    num_local_devices: number of local devices to set up.
+  """
+  # Redirect client's stderr/stdout to undeclared outputs on sponge.
+  redirect_output(f'test-{test_name}-process-{client_id}.log')
+
+  # Update any specified environment variables.
+  for var, val in env.items():
+    os.environ[var] = val
+
+  # Set up local devices.
+ setup_local_devices(num_local_devices) + + # Set up DTensor cluster and enable collectives. + accelerator_util.initialize_accelerator_system() + + +def run_client( + client_id: int, + test_name: str, + env: Mapping[str, str], + num_local_devices: int, + dispatcher_address: str, + worker_port: int, + batch_size: int, + dataset_paths: List[str], + mesh: Mesh, + batch_dim: Optional[str], + layouts: Tuple[Layout, Layout], +) -> List[Tuple[Any, Any]]: + # Co-located tf.data service mode. It is important to hold the worker object + # until the end otherwise it will get garbage collected. + worker, worker_address = create_worker( # pylint: disable=unused-variable + test_name, dispatcher_address, port=worker_port) + logging.info( + 'tf.data service worker running at %s', + worker_address, + ) + + setup_client(client_id, test_name, env, num_local_devices) + + def decode_fn(record_bytes): + decoded = parsing_ops.parse_single_example_v2( + serialized=record_bytes, + features={ + 'idx': parsing_config.FixedLenFeature([], dtype=dtypes.int64), + 'elem': parsing_config.FixedLenFeature([], dtype=dtypes.string), + }, + ) + parsed_elem = gen_parsing_ops.parse_tensor(decoded['elem'], dtypes.int32) + elem = check_ops.ensure_shape( + parsed_elem, [IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS] + ) + return decoded['idx'], elem + + dataset = dataset_ops.DatasetV2.from_tensor_slices(dataset_paths) + dataset = dataset.interleave(readers.TFRecordDatasetV2) + dataset = dataset.map(decode_fn) + + tf_data_service_config = input_util.TFDataServiceConfig( + dispatcher_address=dispatcher_address, job_name=TF_DATA_SERVICE_JOB_NAME + ) + d_dataset = input_util.DTensorDataset( + dataset=dataset, + global_batch_size=batch_size, + mesh=mesh, + layouts=layouts, + batch_dim=batch_dim, + tf_data_service_config=tf_data_service_config, + ) + + # Subprocesses cannot return a sharded DTensor as it triggers a copy and + # copying non-replicated DTensors is not supported. So instead we unpack it + # and return the component tensors. + ret = [] + for batch_idx, elem in d_dataset: + n_batch_idx = api.unpack(batch_idx) + n_elem = api.unpack(elem) + ret.append((n_batch_idx, n_elem)) + return ret + + +class MultiClientDTensorDatasetTest(test_util.DTensorBaseTest): + + def setUp(self): + super().setUp() + + logging.info('Check per client log in Test artifacts.') + + self.server_ports = [ + multi_client_test_util.pick_unused_port() for _ in range(NUM_CLIENTS) + ] + + self.worker_ports = [ + multi_client_test_util.pick_unused_port() for _ in range(NUM_CLIENTS) + ] + worker_addresses = [f'localhost:{port}' for port in self.worker_ports] + self.cluster = TFDataServiceCluster( + test_name=self._testMethodName, + num_workers=0, # Co-located mode. 
+ worker_addresses=worker_addresses) + + def tearDown(self): + super().tearDown() + self.cluster.stop() + + def write_dataset(self, dataset, num_files, num_elems): + """Writes a dataset_ops.DatasetV2 to multiple files.""" + dataset_paths = [] + dataset_iter = iter(dataset) + + for file_idx in range(num_files): + dataset_path = os.path.join(self.get_temp_dir(), + f'dataset-{file_idx}.tfrecords') + dataset_paths.append(dataset_path) + with tf_record.TFRecordWriter(dataset_path) as writer: + for _ in range(num_elems // num_files): + idx, elem = next(dataset_iter) + elem_bytes = example_pb2.Example( + features=feature_pb2.Features( + feature={ + 'idx': feature_pb2.Feature( + int64_list=feature_pb2.Int64List(value=[idx]) + ), + 'elem': feature_pb2.Feature( + bytes_list=feature_pb2.BytesList( + value=[io_ops.serialize_tensor(elem).numpy()] + ) + ), + } + ) + ).SerializeToString() + writer.write(elem_bytes) + + return dataset_paths + + @parameterized.product( + ( + { + # batch=4 x height=2 x width=2 + # 1 replica per client. + 'mesh_dims': [(MESH_DIM_BATCH, 4), + (MESH_DIM_HEIGHT, 2), + (MESH_DIM_WIDTH, 2)], + }, { + # batch=4 x height=2 x width=2 (transposed) + # 1 replica per client with reordered local partitions. + 'mesh_dims': [(MESH_DIM_BATCH, 4), + (MESH_DIM_WIDTH, 2), + (MESH_DIM_HEIGHT, 2)], + }, { + # batch=8 x height=2 x width=1 + # 2 replicas per client. + 'mesh_dims': [(MESH_DIM_BATCH, 8), + (MESH_DIM_HEIGHT, 2), + (MESH_DIM_WIDTH, 1)], + }, { + # batch=8 x height=2 x width=1 (transposed) + # 2 replicas per client with reordered partitions. + 'mesh_dims': [(MESH_DIM_BATCH, 8), + (MESH_DIM_WIDTH, 1), + (MESH_DIM_HEIGHT, 2)], + }, { + # batch=2 x height=4 x width=2 + # 1 replica split over 2 clients. + 'mesh_dims': [(MESH_DIM_BATCH, 2), + (MESH_DIM_HEIGHT, 4), + (MESH_DIM_WIDTH, 2)], + }, { + # batch=2 x height=4 x width=2 (transposed) + # 1 replica split over 2 clients with reordered partitions. + 'mesh_dims': [(MESH_DIM_BATCH, 2), + (MESH_DIM_WIDTH, 2), + (MESH_DIM_HEIGHT, 4)], + }, + ), + ( + { + # Replicated + 'idx_sharding': [UNSHARDED], + 'images_sharding': [UNSHARDED, UNSHARDED, UNSHARDED, UNSHARDED], + }, { + # Batch sharded + 'idx_sharding': [MESH_DIM_BATCH], + 'images_sharding': + [MESH_DIM_BATCH, UNSHARDED, UNSHARDED, UNSHARDED], + }, { + # Spatially sharded + 'idx_sharding': [UNSHARDED], + 'images_sharding': + [UNSHARDED, MESH_DIM_HEIGHT, MESH_DIM_WIDTH, UNSHARDED], + }, { + # Batch and spatially sharded + 'idx_sharding': [MESH_DIM_BATCH], + 'images_sharding': + [MESH_DIM_BATCH, MESH_DIM_HEIGHT, MESH_DIM_WIDTH, UNSHARDED], + } + )) + def testMultiClientIter(self, mesh_dims, idx_sharding, images_sharding): + num_batches = 4 + batch_size = 16 + num_elems = num_batches * batch_size + + images = stateless_random_ops.stateless_random_uniform( + [num_elems, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS], + seed=(1, 2), + minval=0, + maxval=255, + dtype=dtypes.int32, + ) + dataset = dataset_ops.DatasetV2.from_tensor_slices(images) + + # Enumerate the dataset elements to make it easier to identify the batches + # returned by the DTensorDataset. + dataset = dataset.enumerate() + + # Store a mapping of index to dataset elements which can be looked up later + # to identify the batches returned by the DTensorDataset. + all_elems = {idx.numpy(): elem for idx, elem in dataset} + + # Write the dataset and shard it among multiple files. + dataset_paths = self.write_dataset( + dataset, num_files=8, num_elems=num_elems) + + # Construct args for starmap. 
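+    # One argument tuple is built per client, mirroring run_client's
+    # signature: the client id and env vars identify the process within the
+    # DTensor job, the device-id slice defines its portion of the mesh, and
+    # the worker port is used for its co-located tf.data service worker.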
+ args = [] + mesh_dim_names, mesh_dim_sizes = zip(*mesh_dims) + global_device_ids = test_util.create_device_ids_array(mesh_dim_sizes) + device_ids_split = np.split(np.ravel(global_device_ids), NUM_CLIENTS) + dtensor_jobs = [ + f'localhost:{self.server_ports[i]}' for i in range(NUM_CLIENTS) + ] + + for client_id in range(NUM_CLIENTS): + # Manually specify DTensor environment variables since we are in a test + # environment. + env = { + config._DT_CLIENT_ID: str(client_id), + config._DT_JOB_NAME: str(JOB_NAME), + config._DT_JOBS: ','.join(dtensor_jobs) + } + + local_device_ids = device_ids_split[client_id].tolist() + local_devices = [ + device_spec.DeviceSpecV2( # pylint: disable=g-complex-comprehension + job=JOB_NAME, + replica=0, + task=client_id, + device_type='CPU', + device_index=i, + ) + for i in range(len(local_device_ids)) + ] + mesh = Mesh( + dim_names=mesh_dim_names, + global_device_ids=global_device_ids, + local_device_ids=local_device_ids, + local_devices=local_devices, + ) + idx_layout = Layout(idx_sharding, mesh) + images_layout = Layout(images_sharding, mesh) + batch_dim = MESH_DIM_BATCH if MESH_DIM_BATCH in images_sharding else None + + args.append((client_id, self._testMethodName, env, NUM_DEVICES_PER_CLIENT, + self.cluster.dispatcher_address(), + self.worker_ports[client_id], batch_size, dataset_paths, + mesh, batch_dim, (idx_layout, images_layout))) + + def get_results(): + # Run the DTensor client processes and get the DTensor dataset components. + with mp_context.Pool(NUM_CLIENTS) as pool: + results = pool.starmap(run_client, args) + pool.close() + pool.join() + + return results + + # TODO(b/271162918): fix multi-client use case. + with self.assertRaises(NotImplementedError): + results = get_results() + + return + # pylint: disable=unreachable + + # Create a mesh on the main test process. The tensor components returned + # from each DTensor client subprocess will be packed onto this mesh to + # verify correctness. + test_mesh = mesh_util.create_mesh( + mesh_dims=mesh_dims, + devices=[ + 'CPU:%d' % i for i in range(NUM_CLIENTS * NUM_DEVICES_PER_CLIENT) + ]) + test_mesh = self.configTestMesh({'CPU': test_mesh}) + idx_test_layout = Layout(idx_sharding, test_mesh) + images_test_layout = Layout(images_sharding, test_mesh) + + for batch_elems in zip(*results): + # Collect the tensor components returned from each client. + idx_components = [] + images_components = [] + for client_id in range(NUM_CLIENTS): + local_idx, local_images = batch_elems[client_id] + idx_components.extend(local_idx) + images_components.extend(local_images) + + # Pack the dataset elements into a DTensor on the test mesh. + d_idx = api.pack(idx_components, idx_test_layout) + d_images = api.pack(images_components, images_test_layout) + + # Get the batch of elements from the original dataset using the element + # indices. 
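+      # Popping each index from all_elems lets the assertEmpty check at the
+      # end verify that every element was produced exactly once.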
+ batch_stack = [] + for elem_idx in d_idx: + batch_stack.append(all_elems.pop(elem_idx.numpy())) + batch = array_ops_stack.stack(batch_stack) + + self.assertDTensorEqual(batch, images_test_layout, d_images) + + self.assertEmpty( + all_elems, 'Not all batches were returned by DTensorDataset.') + + +if __name__ == '__main__': + test_backend_util.handle_test_main(test.main) diff --git a/tensorflow/dtensor/python/tests/multi_client_test_util.py b/tensorflow/dtensor/python/tests/multi_client_test_util.py index 35d3e7aa10ef98..dd4a69f14f77e4 100644 --- a/tensorflow/dtensor/python/tests/multi_client_test_util.py +++ b/tensorflow/dtensor/python/tests/multi_client_test_util.py @@ -31,6 +31,11 @@ 'Number of clients. 0 for local mode. 2 is the only allowed value for TPU.') +def pick_unused_port(): + """Helper function to return an unused port.""" + return portpicker.pick_unused_port() + + def multi_client_main(client_config_function): """Creates a Flock of TensorFlow Processes on localhost.""" flags.FLAGS(sys.argv, known_only=True) @@ -49,12 +54,11 @@ def multi_client_main(client_config_function): # Inverts the order of ports intentionally to rule out ordering bugs. server_ports = sorted( - [portpicker.pick_unused_port() for _ in range(num_process)], reverse=True) - - additional_ports = sorted( - [portpicker.pick_unused_port() for _ in range(num_process)] + [pick_unused_port() for _ in range(num_process)], reverse=True ) + additional_ports = sorted([pick_unused_port() for _ in range(num_process)]) + # Starts processes procs = [] for client_idx in range(num_process): @@ -138,4 +142,3 @@ def run_client(idx, num_clients, server_ports, additional_ports, # The following function call never returns. tf_test.main() - diff --git a/tensorflow/dtensor/python/tests/numerics_test.py b/tensorflow/dtensor/python/tests/numerics_test.py new file mode 100644 index 00000000000000..60bb7995adf6e2 --- /dev/null +++ b/tensorflow/dtensor/python/tests/numerics_test.py @@ -0,0 +1,125 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for numerics in DTensor Ops.""" + +import os + +from absl.testing import parameterized +import numpy as np + +from tensorflow.dtensor.python import accelerator_util +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import stateless_random_ops +from tensorflow.python.platform import test + +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh +UNSHARDED = layout_lib.UNSHARDED +_MESH_DIM_X = 'x' +_MESH_DIM_Y = 'y' +_MESH_DIMS = [_MESH_DIM_X, _MESH_DIM_Y] + + +class NumericTest(test_util.DTensorBaseTest): + + def setUp(self): + super(NumericTest, self).setUp() + + self.skipForDeviceType(['TPU'], + 'all tests require 8 TPU cores.', + unless_device_count_equals_to=8) + + test_util.reset_logical_devices('CPU', 8) + accelerator_util.initialize_accelerator_system() + + self.stateless_random_seed = [0, 1] + + def _create_mesh(self, topology, device): + device_ids = test_util.create_device_ids_array(topology) + return Mesh( + _MESH_DIMS, + device_ids, + np.ravel(device_ids).tolist(), + test_util.create_device_list(topology, device), + ) + + # Tests AllReduce numerics with and without mixed precision reduce enabled, + # based on go/dtensor-numerics. + @parameterized.named_parameters(('_without_mixed_precision_reduce', False), + ('_with_mixed_precision_reduce', True)) + def test_all_reduce(self, enable_mixed_precision_reduce): + if enable_mixed_precision_reduce: + os.environ['DTENSOR_ENABLE_MIXED_PRECISION_REDUCE'] = '' + # Override group size since we are testing on smaller mesh. + os.environ['DTENSOR_REDUCE_IN_BFLOAT16_MAX_GROUP_SIZE'] = '4' + else: + if 'DTENSOR_ENABLE_MIXED_PRECISION_REDUCE' in os.environ: + del os.environ['DTENSOR_ENABLE_MIXED_PRECISION_REDUCE'] + + @polymorphic_function.function + def _compute_reduction(inp): + return math_ops.reduce_sum(inp, axis=[2]) + + input_tensor = stateless_random_ops.stateless_random_uniform( + shape=(8, 8, 8, 64), + seed=self.stateless_random_seed, + minval=-5.0, + maxval=5.0, + dtype=dtypes.bfloat16, + ) + expected = _compute_reduction(input_tensor) + + # Compute reduction on 8x1, since dim 2 is unsharded AllReduce will not be + # needed. + mesh_8x1 = self._create_mesh((8, 1), 'TPU') + input_8x1 = numpy_util.pack_numpy( + input_tensor, + Layout([_MESH_DIM_X, UNSHARDED, UNSHARDED, UNSHARDED], mesh_8x1), + ) + result_8x1 = _compute_reduction(input_8x1) + result_8x1_np = numpy_util.to_numpy(result_8x1) + + # Compute reduction on 1x8, AllReduce will be needed since dim 2 is sharded. + mesh_1x8 = self._create_mesh((1, 8), 'TPU') + input_1x8 = numpy_util.pack_numpy( + input_tensor, + Layout([_MESH_DIM_X, UNSHARDED, _MESH_DIM_Y, UNSHARDED], mesh_1x8), + ) + result_1x8 = _compute_reduction(input_1x8) + result_1x8_np = numpy_util.to_numpy(result_1x8) + + self.assertEqual(result_8x1.dtype, dtypes.bfloat16) + self.assertEqual(result_1x8.dtype, dtypes.bfloat16) + + # Mixed precision does not apply since AllReduce was not used, result will + # always be close to the expected value. + self.assertAllClose(result_8x1_np, expected, atol=1e-5, rtol=1e-5) + + # AllReduce was needed, so result will be more accurate if mixed precision + # is enabled. 
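+    # With mixed-precision reduce the AllReduce is expected to accumulate in
+    # higher precision before casting back to bfloat16, so only that case has
+    # to match `expected` within the tolerance below.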
+ if enable_mixed_precision_reduce: + self.assertAllClose(result_1x8_np, expected, atol=1e-5, rtol=1e-5) + else: + self.assertNotAllClose(result_1x8_np, expected, atol=1e-5, rtol=1e-5) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tests/rng_test.py b/tensorflow/dtensor/python/tests/rng_test.py new file mode 100644 index 00000000000000..41b61119372b0b --- /dev/null +++ b/tensorflow/dtensor/python/tests/rng_test.py @@ -0,0 +1,665 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from absl.testing import parameterized + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import d_variable +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.dtensor.python.tests import test_util_ops +from tensorflow.python.distribute import tpu_strategy +from tensorflow.python.distribute.cluster_resolver.tpu import tpu_cluster_resolver +from tensorflow.python.eager import remote +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import gen_bitwise_ops +from tensorflow.python.ops import gen_stateful_random_ops +from tensorflow.python.ops import gen_stateless_random_ops_v2 +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.tpu import device_assignment as device_assignment_lib + +# pylint: enable=g-direct-tensorflow-import + +# Makes a 2-D mesh with dimensions as, X(2) and Y(4). +_MESH_DIM_X = 'x' +_MESH_DIM_Y = 'y' +_MESH_DIMS = [_MESH_DIM_X, _MESH_DIM_Y] + +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh + +# Create a random local IDs to make tests more challenging. +_LOCAL_IDS = [7, 3, 1, 4, 2, 0, 6, 5] +# The row and col indices for each local id, e.g., 7 is (row=1, col=3) +_ROW_INDEX = [i / 4 for i in _LOCAL_IDS] +_COL_INDEX = [i % 4 for i in _LOCAL_IDS] + +# The index of local id for the row head. +# +# For example, local id 7 is on row 1, the head is local id 4, whose index in +# _LOCAL_IDS is 3, i.e., _LOCAL_IDS[3] == 4 +_ROW_0_HEAD = 3 +_ROW_1_HEAD = 5 +_ROW_HEAD = [3, 5, 5, 3, 5, 5, 3, 3] + +# The index of local id for the col head. Similar to row id before. 
+_COL_0_HEAD = 5 +_COL_1_HEAD = 2 +_COL_2_HEAD = 4 +_COL_3_HEAD = 1 +_COL_HEAD = [1, 1, 2, 5, 4, 5, 4, 2] + +_tpu_strategy = None + + +def _call_op(op, seed, shape, dtype, key, counter, alg, minval, maxval, + op_version): + if op_version == 'V1': + return op(shape=shape, seed=seed, dtype=dtype) + elif op_version == 'V2': + return op(shape=shape, key=key, counter=counter, alg=alg, dtype=dtype) + elif op_version == 'V2_RANGE': + return op( + shape=shape, + key=key, + counter=counter, + alg=alg, + minval=minval, + maxval=maxval) + else: + raise ValueError('op_version argument was invalid.') + + +def _call_dtensor_op(op, seed, shape, dtype, key, counter, alg, minval, maxval, + op_version, mesh): + if op_version == 'V1': + return op(shape=shape, seed=seed, dtype=dtype) + + shape = numpy_util.pack_numpy( + constant_op.constant(shape), Layout.replicated(mesh, 1) + ) + key = numpy_util.pack_numpy(key, Layout.replicated(mesh, 1)) + counter = numpy_util.pack_numpy(counter, Layout.replicated(mesh, 1)) + + if op_version == 'V2': + return op(shape=shape, key=key, counter=counter, alg=alg, dtype=dtype) + elif op_version == 'V2_RANGE': + return op( + shape=shape, + key=key, + counter=counter, + alg=alg, + minval=minval, + maxval=maxval) + else: + raise ValueError('op_version argument was invalid.') + + +def get_tpu_strategy(): + """Returns a single-core TPUStrategy.""" + global _tpu_strategy + if _tpu_strategy is not None: + return _tpu_strategy + + resolver = tpu_cluster_resolver.TPUClusterResolver(tpu='') + remote.connect_to_cluster(resolver) + topology = tpu_cluster_resolver.initialize_tpu_system(resolver) + device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1 + ) + strategy = tpu_strategy.TPUStrategyV2( + resolver, experimental_device_assignment=device_assignment + ) + _tpu_strategy = strategy + return strategy + + +def rng_op_spmd(op, + device_id, + seed, + shape, + dtype, + key, + counter, + alg, + minval, + maxval, + op_version, + device_index_fn, + full_replicated=False, + is_tpu=False): + + if not is_tpu: + return rng_op_spmd_fn( + op, + device_id, + seed, + shape, + dtype, + key, + counter, + alg, + minval, + maxval, + op_version, + device_index_fn, + full_replicated=full_replicated) + + # As of 2021-April, TPU eager and multi-device function produce different + # stateless rng results compared with bridge compiled function. As DTensor + # uses bridge to lower TPU function by default, we need to create a + # TPUStrategy for single core and invoke `run` on it. + @polymorphic_function.function + def tpu_fn(device_id, seed): + return rng_op_spmd_fn( + op, + device_id, + seed, + shape, + dtype, + key, + counter, + alg, + minval, + maxval, + op_version, + device_index_fn, + full_replicated=full_replicated) + + return get_tpu_strategy().run(tpu_fn, args=(device_id, seed)) + + +def rng_op_spmd_fn(op, + device_id, + seed, + shape, + dtype, + key, + counter, + alg, + minval, + maxval, + op_version, + device_index_fn, + full_replicated=False): + if full_replicated: + # TODO(bfontain,xiejw): Consider to make this consistent with non-replicated + # case. Seems very confusing. + new_seed, new_key = seed, key + else: + # Runs on TF2 non-DTensor pure eager. This code should align the same + # logic in RandomOpSPMDExpander. 
+ x_cord = device_id // 4 + y_cord = device_id % 4 + device_index = device_index_fn(x_cord, y_cord) + device_id_seed = device_index * 65536 + 65521 + new_seed = gen_bitwise_ops.bitwise_xor(seed, device_id_seed) + new_key = gen_bitwise_ops.bitwise_xor( + key, math_ops.cast(device_id_seed, dtype=dtypes.uint64) + ) + return _call_op( + op=op, + seed=new_seed, + shape=shape, + dtype=dtype, + key=new_key, + counter=counter, + alg=alg, + minval=minval, + maxval=maxval, + op_version=op_version) + + +class DTensorRNGTest(test_util.DTensorBaseTest): + + def setUp(self): + super(DTensorRNGTest, self).setUp() + global_ids = test_util.create_device_ids_array((2, 4)) + local_ids = _LOCAL_IDS + mesh_dict = { + device: Mesh( + [_MESH_DIM_X, _MESH_DIM_Y], + global_ids, + local_ids, + test_util.create_device_list((2, 4), device), + ) + for device in ('CPU', 'GPU', 'TPU') + } + self.mesh = self.configTestMesh(mesh_dict) + + # Creates a bunch of common layouts used by tests later. + self.replicated_layout_2d = Layout.replicated(self.mesh, rank=2) + self.shardings = { + 'batch': Layout.batch_sharded, + 'inner': Layout.inner_sharded + } + # Creates a bunch of parameters for rng V2 ops + self.key = constant_op.constant([123], dtype=dtypes.uint64) + self.counter = constant_op.constant([1, 1], dtype=dtypes.uint64) + self.alg = 1 + self.minval = 1 + self.maxval = 100 + + @parameterized.named_parameters(test_util_ops.RANDOM_OPS) + def testStatelessRNGWithFullyReplicated(self, op, dtype, op_version): + layout = self.replicated_layout_2d + shape = [16, 16] + seed = [123, 321] + + with ops.device_v2(api.device_name()): + with api._dtensor_device()._default_layout(layout): + b = _call_dtensor_op( + op=op, + seed=seed, + shape=shape, + dtype=dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + mesh=self.mesh) + + api.check_layout(b, layout) + self.assertListEqual(shape, list(b.shape)) + + b = [tensor.numpy() for tensor in api.unpack(b)] + for i in range(self.mesh.num_local_devices() - 1): + self.assertAllEqual(b[i], b[i + 1]) + + @parameterized.named_parameters(test_util_ops.RANDOM_OPS) + def testStatelessRNGWithFullyReplicatedComparingWithNonDTensor( + self, op, dtype, op_version): + + layout = self.replicated_layout_2d + shape = [16, 16] + seed = [123, 321] + + with ops.device_v2(api.device_name()): + with api._dtensor_device()._default_layout(layout): + b = _call_dtensor_op( + op=op, + seed=seed, + shape=shape, + dtype=dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + mesh=self.mesh) + + api.check_layout(b, layout) + self.assertListEqual(shape, list(b.shape)) + + b = [tensor.numpy() for tensor in api.unpack(b)] + + local_shape = shape + for index, device_id in enumerate(_LOCAL_IDS): + self.assertAllEqual( + b[index], + rng_op_spmd( + op, + device_id, + seed, + local_shape, + dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + device_index_fn=None, # not needed + full_replicated=True, + is_tpu=self.mesh.device_type().upper() == 'TPU')) + + @parameterized.named_parameters( + test_util_ops.expand_test_config( + test_util_ops.RANDOM_OPS, + [ + { + 'dim': _MESH_DIM_X, + 'shard_type': 'batch', + }, + { + 'dim': _MESH_DIM_Y, + 'shard_type': 'batch', + }, + { + 'dim': _MESH_DIM_X, + 'shard_type': 'inner', + }, + {'dim': _MESH_DIM_Y, 'shard_type': 'inner'}, + ], + ) + ) + def 
testStatelessRNGOpsWithSingleDimensionSharded(self, op, dtype, op_version, + dim, shard_type): + shape = [128, 128] + seed = [123, 321] + sharding = self.shardings[shard_type] + layout = sharding(self.mesh, dim, rank=2) + + # Raw rng Ops do not have inputs, so we need to place the Op DTensor device + # explicitly. + with ops.device_v2(api.device_name()): + with api._dtensor_device()._default_layout(layout): + b = _call_dtensor_op( + op=op, + seed=seed, + shape=shape, + dtype=dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + mesh=self.mesh) + + api.check_layout(b, layout) + b = [tensor.numpy() for tensor in api.unpack(b)] + + if dim == _MESH_DIM_X: + if shard_type == 'batch': + self.assertAllEqual(b[0].shape, [64, 128]) + else: + assert shard_type == 'inner' + self.assertAllEqual(b[0].shape, [128, 64]) + + # first check that each component is same as the row header. + for i in range(self.mesh.num_local_devices()): + self.assertAllEqual(b[i], b[_ROW_HEAD[i]]) + # then check the row header are NOT identital. + self.assertNotAllEqual(b[_ROW_0_HEAD], b[_ROW_1_HEAD]) + + elif dim == _MESH_DIM_Y: + if shard_type == 'batch': + self.assertAllEqual(b[0].shape, [32, 128]) + else: + assert shard_type == 'inner' + self.assertAllEqual(b[0].shape, [128, 32]) + + # first check elements in same columns are identical + for i in range(self.mesh.num_local_devices()): + self.assertAllEqual(b[i], b[_COL_HEAD[i]]) + + col_heads = [_COL_0_HEAD, _COL_1_HEAD, _COL_2_HEAD, _COL_3_HEAD] + # then check the column header are not identital (mutually) + for i in range(self.mesh.num_local_devices() - 1): + for j in range(self.mesh.num_local_devices()): + if i == j: + continue + if i in col_heads and j in col_heads: + self.assertNotAllEqual(b[i], b[j]) + + else: + self.fail('should not reach here.') + + @parameterized.named_parameters( + test_util_ops.expand_test_config( + test_util_ops.RANDOM_OPS, + [ + { + 'dim': _MESH_DIM_X, + 'shard_type': 'batch', + }, + { + 'dim': _MESH_DIM_Y, + 'shard_type': 'batch', + }, + { + 'dim': _MESH_DIM_X, + 'shard_type': 'inner', + }, + {'dim': _MESH_DIM_Y, 'shard_type': 'inner'}, + ], + ) + ) + def testStatelessRNGOpsWithSingleDimensionShardedComparingWithNonDTensor( + self, op, dtype, op_version, dim, shard_type): + + shape = [128, 128] + seed = [123, 321] + sharding = self.shardings[shard_type] + layout = sharding(self.mesh, dim, rank=2) + + # Raw rng Ops do not have inputs, so we need to place the Op DTensor device + # explicitly. + with ops.device_v2(api.device_name()): + with api._dtensor_device()._default_layout(layout): + b = _call_dtensor_op( + op=op, + seed=seed, + shape=shape, + dtype=dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + mesh=self.mesh) + + api.check_layout(b, layout) + b = [tensor.numpy() for tensor in api.unpack(b)] + + if dim == _MESH_DIM_X: + if shard_type == 'batch': + local_shape = [64, 128] + else: + local_shape = [128, 64] + + def device_index_fn(x_cord, y_cord): + # See todo of device_index_fn in 2d sharding case. 
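+        # When only the x dimension is sharded, which shard a device holds is
+        # determined by its x coordinate alone, so y_cord can be ignored.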
+ del y_cord + return x_cord + + for index, device_id in enumerate(_LOCAL_IDS): + self.assertAllEqual( + b[index], + rng_op_spmd( + op, + device_id, + seed, + local_shape, + dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + device_index_fn=device_index_fn, + is_tpu=self.mesh.device_type().upper() == 'TPU')) + elif dim == _MESH_DIM_Y: + if shard_type == 'batch': + local_shape = [32, 128] + else: + local_shape = [128, 32] + + def device_index_fn(x_cord, y_cord): + # See todo of device_index_fn in 2d sharding case. note this case is + # particulary interesting as 2*y_cord is more natual. + del x_cord + return y_cord + + for index, device_id in enumerate(_LOCAL_IDS): + self.assertAllEqual( + b[index], + rng_op_spmd( + op, + device_id, + seed, + local_shape, + dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + device_index_fn=device_index_fn, + is_tpu=self.mesh.device_type().upper() == 'TPU')) + + else: + self.fail('should not reach here.') + + @parameterized.named_parameters(test_util_ops.RANDOM_OPS) + def testStatelessRNGOpsWith2DSharding(self, op, dtype, op_version): + shape = [128, 128] + seed = [123, 321] + layout = Layout([_MESH_DIM_Y, _MESH_DIM_X], self.mesh) + + # Raw rng Ops do not have inputs, so we need to place the Op DTensor device + # explicitly. + with ops.device_v2(api.device_name()): + with api._dtensor_device()._default_layout(layout): + b = _call_dtensor_op( + op=op, + seed=seed, + shape=shape, + dtype=dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + mesh=self.mesh) + + api.check_layout(b, layout) + b = [tensor.numpy() for tensor in api.unpack(b)] + + # check all raw components are not identital (mutually) + for i in range(self.mesh.num_local_devices() - 1): + for j in range(self.mesh.num_local_devices()): + if i == j: + continue + self.assertNotAllEqual(b[i], b[j]) + + @parameterized.named_parameters(test_util_ops.RANDOM_OPS) + def testStatelessRNGOpsWith2DShardingComparingWithNonDTensor( + self, op, dtype, op_version): + shape = [128, 128] + seed = [123, 321] + layout = Layout([_MESH_DIM_Y, _MESH_DIM_X], self.mesh) + local_shape = [128 // 4, 128 // 2] + + # Raw rng Ops do not have inputs, so we need to place the Op DTensor device + # explicitly. + with ops.device_v2(api.device_name()): + with api._dtensor_device()._default_layout(layout): + b = _call_dtensor_op( + op=op, + seed=seed, + shape=shape, + dtype=dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + mesh=self.mesh) + + api.check_layout(b, layout) + b = [tensor.numpy() for tensor in api.unpack(b)] + + def device_index_fn(x_cord, y_cord): + # TODO(bfontain,xiejw): Currently, the device index is x+2y. But it is + # more natual to use 4x+y for a mesh. Consider to change this + # once all correctness tests are done. 
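+      # With x in [0, 2) and y in [0, 4) on the 2x4 mesh, x + 2*y still maps
+      # the 8 devices to unique indices; it simply makes x the fastest-varying
+      # coordinate instead of the row-major 4*x + y.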
+ return x_cord + 2 * y_cord + + for index, device_id in enumerate(_LOCAL_IDS): + self.assertAllEqual( + b[index], + rng_op_spmd( + op, + device_id, + seed, + local_shape, + dtype, + key=self.key, + counter=self.counter, + alg=self.alg, + minval=self.minval, + maxval=self.maxval, + op_version=op_version, + device_index_fn=device_index_fn, + is_tpu=self.mesh.device_type().upper() == 'TPU')) + + def testRNGReadAndSkip(self): + replicated_layout = Layout.replicated(self.mesh, 1) + a = constant_op.constant([1, 2, 3], dtype=dtypes.int64) + v = variables.Variable(a) + expected = gen_stateful_random_ops.rng_read_and_skip( + resource=v.handle, + alg=1, + delta=constant_op.constant(1, dtype=dtypes.uint64), + ) + + a = numpy_util.pack_numpy(a, replicated_layout) + v = d_variable.DVariable(a) + got = gen_stateful_random_ops.rng_read_and_skip( + resource=v.handle, + alg=1, + delta=constant_op.constant(1, dtype=dtypes.uint64), + ) + + self.assertDTensorEqual(expected, replicated_layout, got) + + def testStatelessRandomGetKeyCounter(self): + seed = constant_op.constant([7, 17], dtypes.int32) + + # TPU computation result is different from CPU computation. + # We force it to run on the TPU using tpu_strategy for TPU mesh + # so that we compare equal values. + @polymorphic_function.function + def tpu_fn(): + return gen_stateless_random_ops_v2.stateless_random_get_key_counter( + seed=seed + ) + + if self.mesh.device_type().upper() == 'TPU': + expected = get_tpu_strategy().run(tpu_fn) + else: + expected = gen_stateless_random_ops_v2.stateless_random_get_key_counter( + seed=seed + ) + + replicated_1d_layout = Layout.replicated(self.mesh, 1) + seed = numpy_util.pack_numpy(seed, replicated_1d_layout) + + got = gen_stateless_random_ops_v2.stateless_random_get_key_counter( + seed=seed + ) + self.assertDTensorEqual(expected[0], replicated_1d_layout, got[0]) + self.assertDTensorEqual(expected[1], replicated_1d_layout, got[1]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tests/save_restore_v2_test.py b/tensorflow/dtensor/python/tests/save_restore_v2_test.py new file mode 100644 index 00000000000000..e53f5003240da9 --- /dev/null +++ b/tensorflow/dtensor/python/tests/save_restore_v2_test.py @@ -0,0 +1,337 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +import gc + +from absl.testing import parameterized + +import numpy as np + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.dtensor.python import api +from tensorflow.dtensor.python import d_variable +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.checkpoint import checkpoint +from tensorflow.python.checkpoint import checkpoint_management +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.module import module +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import stateless_random_ops +from tensorflow.python.platform import test + +Mesh = layout_lib.Mesh +Layout = layout_lib.Layout +UNSHARDED = layout_lib.UNSHARDED + +# Makes a 2D mesh with dimension X(2) and dimension Y(4). +_MESH_DIM_X = 'x' +_MESH_DIM_Y = 'y' +_DEVICE_IDS = test_util.create_device_ids_array((2, 4)) +_TWO_D_CPU_MESH = Mesh( + [_MESH_DIM_X, _MESH_DIM_Y], + _DEVICE_IDS, + np.ravel(_DEVICE_IDS).tolist(), + test_util.create_device_list((2, 4), 'CPU'), +) +_TWO_D_TPU_MESH = Mesh( + [_MESH_DIM_X, _MESH_DIM_Y], + _DEVICE_IDS, + np.ravel(_DEVICE_IDS).tolist(), + test_util.create_device_list((2, 4), 'TPU'), +) +_TWO_D_GPU_MESH = Mesh( + [_MESH_DIM_X, _MESH_DIM_Y], + _DEVICE_IDS, + np.ravel(_DEVICE_IDS).tolist(), + test_util.create_device_list((2, 4), 'GPU'), +) + + +class DTensorSaveRestoreV2Test(test_util.DTensorBaseTest): + + def setUp(self): + super(DTensorSaveRestoreV2Test, self).setUp() + self.skipForDeviceType(['TPU'], + 'all tests require 8 TPU cores.', + unless_device_count_equals_to=8) + mesh_dict = { + 'CPU': _TWO_D_CPU_MESH, + 'GPU': _TWO_D_GPU_MESH, + 'TPU': _TWO_D_TPU_MESH, + } + self.mesh = self.configTestMesh(mesh_dict) + self.skipForTfrt( + 'b/235088250, DTensorCheckpointingV2 requires upcasting TF scalar ' + 'variables to replicated DTensor scalar variables, which is not ' + 'supported in TFRT.') + + @parameterized.named_parameters( + ('x_unsharded', [_MESH_DIM_X, UNSHARDED]), + ('unsharded_x', [UNSHARDED, _MESH_DIM_X]), + ('x_y', [_MESH_DIM_X, _MESH_DIM_Y]), + ('unsharded_unsharded', [UNSHARDED, UNSHARDED]), + ) + def test_checkpoint_simple(self, shard_spec): + tensor_a = stateless_random_ops.stateless_random_uniform( + shape=[4, 8], seed=[0, 1] + ) + tensor_b = stateless_random_ops.stateless_random_uniform( + shape=[2, 4], seed=[0, 1] + ) + + layout = Layout(shard_spec, self.mesh) + + dvariable_a = d_variable.DVariable(numpy_util.pack_numpy(tensor_a, layout)) + dvariable_b = d_variable.DVariable(numpy_util.pack_numpy(tensor_b, layout)) + + # Record a checkpoint with two dvariables. + ckpt = checkpoint.Checkpoint(a=dvariable_a, b=dvariable_b) + + saved_path = ckpt.save(self.get_temp_dir()) + + # Zero out the values of the DVariables so that we can restore + # and check that the values are restored to the initial random values. 
+ dvariable_a.assign( + numpy_util.pack_numpy( + array_ops.zeros([4, 8], dtype=dtypes.float32), layout + ) + ) + dvariable_b.assign( + numpy_util.pack_numpy( + array_ops.zeros([2, 4], dtype=dtypes.float32), layout + ) + ) + + ckpt.restore(saved_path) + + self.assertDTensorEqual(tensor_a, layout, dvariable_a.read_value()) + self.assertDTensorEqual(tensor_b, layout, dvariable_b.read_value()) + + @parameterized.named_parameters( + ('x_unsharded', [_MESH_DIM_X, UNSHARDED]), + ('unsharded_x', [UNSHARDED, _MESH_DIM_X]), + ('x_y', [_MESH_DIM_X, _MESH_DIM_Y]), + ('unsharded_unsharded', [UNSHARDED, UNSHARDED]), + ) + def test_checkpoint_write(self, shard_spec): + tensor_a = stateless_random_ops.stateless_random_uniform( + shape=[4, 8], seed=[0, 1] + ) + tensor_b = stateless_random_ops.stateless_random_uniform( + shape=[2, 4], seed=[0, 1] + ) + + layout = Layout(shard_spec, self.mesh) + + dvariable_a = d_variable.DVariable(numpy_util.pack_numpy(tensor_a, layout)) + dvariable_b = d_variable.DVariable(numpy_util.pack_numpy(tensor_b, layout)) + + ckpt = checkpoint.Checkpoint(a=dvariable_a, b=dvariable_b) + + saved_path = ckpt.write(self.get_temp_dir()) + + dvariable_a.assign( + numpy_util.pack_numpy( + array_ops.zeros([4, 8], dtype=dtypes.float32), layout + ) + ) + dvariable_b.assign( + numpy_util.pack_numpy( + array_ops.zeros([2, 4], dtype=dtypes.float32), layout + ) + ) + + ckpt.restore(saved_path) + + self.assertDTensorEqual(tensor_a, layout, dvariable_a.read_value()) + self.assertDTensorEqual(tensor_b, layout, dvariable_b.read_value()) + + @parameterized.named_parameters( + ('x_unsharded', [_MESH_DIM_X, UNSHARDED]), + ('unsharded_x', [UNSHARDED, _MESH_DIM_X]), + ('x_y', [_MESH_DIM_X, _MESH_DIM_Y]), + ('unsharded_unsharded', [UNSHARDED, UNSHARDED]), + ) + def test_checkpoint_manager(self, shard_spec): + tensor_a = stateless_random_ops.stateless_random_uniform( + shape=[8, 16], seed=[0, 1] + ) + tensor_b = stateless_random_ops.stateless_random_uniform( + shape=[4, 4], seed=[0, 1] + ) + + layout = Layout(shard_spec, self.mesh) + + dvariable_a = d_variable.DVariable(numpy_util.pack_numpy(tensor_a, layout)) + dvariable_b = d_variable.DVariable(numpy_util.pack_numpy(tensor_b, layout)) + + # Record a checkpoint with two dvariables. + ckpt = checkpoint.Checkpoint(a=dvariable_a, b=dvariable_b) + + checkpoint_manager = checkpoint_management.CheckpointManager( + ckpt, self.get_temp_dir(), max_to_keep=None + ) + + saved_path = checkpoint_manager.save() + + # Zero out the values of the DVariables so that we can restore + # and check that the values are restored to the initial random values. 
+ dvariable_a.assign( + numpy_util.pack_numpy( + array_ops.zeros([8, 16], dtype=dtypes.float32), layout + ) + ) + dvariable_b.assign( + numpy_util.pack_numpy( + array_ops.zeros([4, 4], dtype=dtypes.float32), layout + ) + ) + + ckpt.restore(saved_path) + + self.assertDTensorEqual(tensor_a, layout, dvariable_a.read_value()) + self.assertDTensorEqual(tensor_b, layout, dvariable_b.read_value()) + + @parameterized.named_parameters( + ('x_unsharded', [_MESH_DIM_X, UNSHARDED]), + ('unsharded_x', [UNSHARDED, _MESH_DIM_X]), + ('x_y', [_MESH_DIM_X, _MESH_DIM_Y]), + ('unsharded_unsharded', [UNSHARDED, UNSHARDED]), + ) + def test_checkpoint_restore_with_different_layout(self, shard_spec): + tensor_a = stateless_random_ops.stateless_random_uniform( + shape=[4, 8], seed=[0, 1] + ) + tensor_b = stateless_random_ops.stateless_random_uniform( + shape=[2, 4], seed=[0, 1] + ) + + layout = Layout(shard_spec, self.mesh) + + dvariable_a = d_variable.DVariable(numpy_util.pack_numpy(tensor_a, layout)) + dvariable_b = d_variable.DVariable(numpy_util.pack_numpy(tensor_b, layout)) + + # Record a checkpoint with two dvariables. + checkpoint_1 = checkpoint.Checkpoint(a=dvariable_a, b=dvariable_b) + + saved_path = checkpoint_1.save(self.get_temp_dir()) + + new_layout = Layout([_MESH_DIM_X, _MESH_DIM_Y], self.mesh) + + # Create new Dvariables, zero'd out with different layouts + # from the layouts we saved the tensors. + dvariable_a = d_variable.DVariable( + numpy_util.pack_numpy( + array_ops.zeros([4, 8], dtype=dtypes.float32), new_layout + ) + ) + dvariable_b = d_variable.DVariable( + numpy_util.pack_numpy( + array_ops.zeros([2, 4], dtype=dtypes.float32), new_layout + ) + ) + + checkpoint_2 = checkpoint.Checkpoint(a=dvariable_a, b=dvariable_b) + + checkpoint_2.restore(saved_path) + + self.assertDTensorEqual(tensor_a, new_layout, dvariable_a.read_value()) + self.assertDTensorEqual(tensor_b, new_layout, dvariable_b.read_value()) + + @parameterized.named_parameters( + ('x_unsharded', [_MESH_DIM_X, UNSHARDED]), + ('unsharded_x', [UNSHARDED, _MESH_DIM_X]), + ) + def test_checkpoint_in_a_train_loop(self, shard_dims): + # This test is a parallel test with save_restore_test's + # DTensorSaveRestoreTest.test_checkpoint + + class M(module.Module): + + # Pass in both replicated and sharded for better coverage. + def __init__(self, replicated_value, sharded_value): + # This is actually a DVariable. + self.r = d_variable.DVariable(replicated_value) + self.s = d_variable.DVariable(sharded_value) + + def __call__(self, x): + return math_ops.reduce_sum(x + self.r) + math_ops.reduce_sum(x + self.s) + + directory = self.get_temp_dir() + + sharded_np = np.arange(8).reshape((2, 4)).astype(np.float32) + replicated_np = np.arange(16).reshape((8, 2)).astype(np.float32) + + replicated_layout = Layout.replicated(self.mesh, rank=2) + one_d_sharded_layout = Layout(shard_dims, self.mesh) + + replicated_value = api.copy_to_mesh(replicated_np, replicated_layout) + replicated_zeros = api.copy_to_mesh( + np.zeros((8, 2)).astype(np.float32), replicated_layout + ) + + sharded_value = numpy_util.pack_numpy(sharded_np, one_d_sharded_layout) + sharded_zeros = numpy_util.pack_numpy( + np.zeros((2, 4)).astype(np.float32), one_d_sharded_layout) + + # Training loop that just increments the model's variable every "epoch" + # to test checkpointing. 
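+    # On the first iteration manager.latest_checkpoint is None, so restore()
+    # is effectively a no-op and the variables keep their initial values;
+    # every later iteration restores the values saved at the end of the
+    # previous epoch, which is why the assertions expect `epoch + <initial>`.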
+ for epoch in range(5): + m = M(replicated_value, sharded_value) + + ckpt = checkpoint.Checkpoint(model=m) + manager = checkpoint_management.CheckpointManager( + ckpt, directory=directory, max_to_keep=None + ) + + ckpt.restore(manager.latest_checkpoint) + + # Ensure that the variable is created + m(api.copy_to_mesh(1.0, Layout.replicated(self.mesh, rank=0))) + + self.assertDTensorEqual(epoch + replicated_np, replicated_layout, m.r) + self.assertDTensorEqual(epoch + sharded_np, one_d_sharded_layout, m.s) + + m.s.assign_add( + numpy_util.pack_numpy( + np.ones((2, 4), dtype=np.float32), one_d_sharded_layout)) + m.r.assign_add( + api.copy_to_mesh( + constant_op.constant(np.ones((8, 2), dtype=np.float32)), + replicated_layout, + ) + ) + + checkpoint_number = epoch + 1 + + stats1 = api._dtensor_device()._get_stats() + manager.save(checkpoint_number=checkpoint_number) + + gc.collect() + stats2 = api._dtensor_device()._get_stats() + keys = set(stats2.keys()) + keys.update(stats1.keys()) + diff = {k: stats2.get(k, 0) - stats1.get(k, 0) for k in keys} + diff = {k: v for k, v in diff.items() if v != 0} + + m.s.assign(sharded_zeros) + m.r.assign(replicated_zeros) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tests/sparse_test.py b/tensorflow/dtensor/python/tests/sparse_test.py new file mode 100644 index 00000000000000..b519da74e4ea57 --- /dev/null +++ b/tensorflow/dtensor/python/tests/sparse_test.py @@ -0,0 +1,141 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from absl.testing import parameterized +import numpy as np + +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.eager.polymorphic_function import polymorphic_function +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +# Convenient constants to use for tests. 
+_BATCH_DIM = "batch" +_MESH_DIM_X = "x" + +# Shorter notation +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh + + +class DTensorSPMDTest(test_util.DTensorBaseTest): + + def setUp(self): + super().setUp() + + self.skipForDeviceType(["GPU", "TPU"], + "SparseTensors only supported on CPU.") + + global_ids = test_util.create_device_ids_array((2, 2)) + local_ids = np.ravel(global_ids).tolist() + mesh_dict = { + device: Mesh( + [_BATCH_DIM, _MESH_DIM_X], + global_ids, + local_ids, + test_util.create_device_list((2, 2), device), + ) + for device in ("CPU", "GPU", "TPU") + } + self.mesh = self.configTestMesh(mesh_dict) + + @parameterized.parameters( + [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64] + ) + def testIdentityOpWithSparseTensorInputSimple(self, dtype): + inputs = array_ops.ones([6, 4], dtype=dtype) + layout = Layout.batch_sharded(self.mesh, _BATCH_DIM, rank=2) + + @polymorphic_function.function + def f(x): + return array_ops.identity(x) + + self.assertDTensorEqual( + inputs, layout, + f(numpy_util.pack_numpy(inputs, layout, make_sparse=True))) + + @parameterized.product( + dtype=[dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64], + is_sparse_a=[True, False], + is_sparse_b=[True, False], + ) + def testIdentityOpWithSparseTensorInputComplex(self, dtype, is_sparse_a, + is_sparse_b): + inputs_a = array_ops.ones([2, 1], dtype=dtype) + inputs_b = array_ops.ones([32, 16], dtype=dtype) + + layout_a = Layout.batch_sharded(self.mesh, _BATCH_DIM, rank=2) + layout_b = Layout.replicated(self.mesh, rank=2) + + @polymorphic_function.function + def f(x, y): + return array_ops.identity(x), array_ops.identity(y) + + got_a, got_b = f( + numpy_util.pack_numpy(inputs_a, layout_a, make_sparse=is_sparse_a), + numpy_util.pack_numpy(inputs_b, layout_b, make_sparse=is_sparse_b)) + + self.assertDTensorEqual(inputs_a, layout_a, got_a) + self.assertDTensorEqual(inputs_b, layout_b, got_b) + + def testMultipleIdentityOpFromOneSparseTensor(self): + inputs_a = array_ops.ones([2, 1]) + layout_a = Layout.batch_sharded(self.mesh, _BATCH_DIM, rank=2) + + @polymorphic_function.function + def f(x): + return array_ops.identity(x), array_ops.identity(x) + + got_a, got_b = f( + numpy_util.pack_numpy(inputs_a, layout_a, make_sparse=True)) + + self.assertDTensorEqual(inputs_a, layout_a, got_a) + self.assertDTensorEqual(inputs_a, layout_a, got_b) + + @parameterized.product( + is_sparse_a=[True, False], + is_sparse_b=[True, False], + shard_type=["Replicated", "Sharded"]) + def testSparseTensorDenseMatMul(self, is_sparse_a, is_sparse_b, shard_type): + inputs_a = array_ops.ones([16, 16]) + inputs_b = array_ops.ones([16, 16]) + + if shard_type == "Replicated": + layout_a = Layout.replicated(self.mesh, rank=2) + layout_b = Layout.replicated(self.mesh, rank=2) + else: + layout_a = Layout([_MESH_DIM_X, _BATCH_DIM], self.mesh) + layout_b = Layout(["unsharded", _MESH_DIM_X], self.mesh) + + expected = math_ops.matmul(inputs_a, inputs_b) + + @polymorphic_function.function + def f(x, y): + return math_ops.matmul(x, y) + + got = f( + numpy_util.pack_numpy(inputs_a, layout_a, make_sparse=is_sparse_a), + numpy_util.pack_numpy(inputs_b, layout_b, make_sparse=is_sparse_b)) + + self.assertDTensorEqual(expected, Layout.replicated(self.mesh, rank=2), got) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/dtensor/python/tests/tpu_device_assignment_test.py b/tensorflow/dtensor/python/tests/tpu_device_assignment_test.py new file mode 100644 index 00000000000000..08ece48382e52b --- /dev/null +++ 
b/tensorflow/dtensor/python/tests/tpu_device_assignment_test.py @@ -0,0 +1,889 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for TPU device assignment.""" + +from tensorflow.dtensor.python import accelerator_util +from tensorflow.dtensor.python import layout as layout_lib +from tensorflow.dtensor.python import numpy_util +from tensorflow.dtensor.python import tpu_util +from tensorflow.dtensor.python.tests import test_util +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +Layout = layout_lib.Layout +Mesh = layout_lib.Mesh + + +class DeviceAssignmentTest(test_util.DTensorBaseTest): + + def setUp(self): + super().setUp() + accelerator_util.initialize_accelerator_system('TPU') + + def tearDown(self): + accelerator_util.shutdown_accelerator_system() + super().tearDown() + + def _build_all_reduce_ring(self, core_locations): + permutation = tpu_util._build_all_reduce_ring(core_locations) + return [core_locations[element] for element in permutation] + + # Picture of chips: + # 0 -- 1 + # | | + # 3 -- 2 + def testBuildAllReduceRing4Replicas(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 0), + ] + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 0), + ] + result = self._build_all_reduce_ring(core_locations) + self.assertAllEqual(result, expected) + + # Picture of chips with core0/core1 assignments: + # 0/1 -- 2/3 + # | | + # 6/7 -- 4/5 + def testBuildAllReduceRing8ReplicasUsingTwoCores(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 1), + ] + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + ] + result = self._build_all_reduce_ring(core_locations) + self.assertAllEqual(result, expected) + + # Picture of chips: + # 0 -- 1 -- 2 -- 3 + # | | + # 15 6 -- 5 -- 4 + # | | + # 14 7 -- 8 -- 9 + # | | + # 13-- 12-- 11-- 10 + def testBuildAllReduceRing32Replicas(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), 
+ tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + ] + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + ] + result = self._build_all_reduce_ring(core_locations) + self.assertAllEqual(result, expected) + + # Picture of chips: + # 7 -- 0 6 -- 5 + # | | + # 2 -- 1 3 -- 4 + def testBuildAllReduceRing3D(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 1), + tpu_util._CoreLocation(0, 1, 1, 0), + tpu_util._CoreLocation(0, 1, 1, 1), + tpu_util._CoreLocation(1, 0, 1, 0), + tpu_util._CoreLocation(1, 0, 1, 1), + tpu_util._CoreLocation(1, 1, 1, 0), + tpu_util._CoreLocation(1, 1, 1, 1), + ] + expected = [ + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(0, 1, 1, 1), + tpu_util._CoreLocation(0, 1, 1, 0), + tpu_util._CoreLocation(1, 1, 1, 1), + tpu_util._CoreLocation(1, 1, 1, 0), + 
tpu_util._CoreLocation(1, 0, 1, 1), + tpu_util._CoreLocation(1, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 1), + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + ] + result = self._build_all_reduce_ring(core_locations) + self.assertAllEqual(result, expected) + + # Picture of chips: + # 31-- 0 -- 1 -- 2 30--29--28--27 + # | | + # 14 5 -- 4 -- 3 15 24--25--26 + # | | | | + # 13 6 -- 7 -- 8 16 23--22--21 + # | | | | + # 12-- 11-- 10-- 9 17--18--19--20 + def testBuildAllReduceRing3DLarge(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + tpu_util._CoreLocation(0, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 1), + tpu_util._CoreLocation(1, 0, 1, 0), + tpu_util._CoreLocation(1, 0, 1, 1), + tpu_util._CoreLocation(2, 0, 1, 0), + tpu_util._CoreLocation(2, 0, 1, 1), + tpu_util._CoreLocation(3, 0, 1, 0), + tpu_util._CoreLocation(3, 0, 1, 1), + tpu_util._CoreLocation(0, 1, 1, 0), + tpu_util._CoreLocation(0, 1, 1, 1), + tpu_util._CoreLocation(1, 1, 1, 0), + tpu_util._CoreLocation(1, 1, 1, 1), + tpu_util._CoreLocation(2, 1, 1, 0), + tpu_util._CoreLocation(2, 1, 1, 1), + tpu_util._CoreLocation(3, 1, 1, 0), + tpu_util._CoreLocation(3, 1, 1, 1), + tpu_util._CoreLocation(0, 2, 1, 0), + tpu_util._CoreLocation(0, 2, 1, 1), + tpu_util._CoreLocation(1, 2, 1, 0), + tpu_util._CoreLocation(1, 2, 1, 1), + tpu_util._CoreLocation(2, 2, 1, 0), + tpu_util._CoreLocation(2, 2, 1, 1), + tpu_util._CoreLocation(3, 2, 1, 0), + tpu_util._CoreLocation(3, 2, 1, 1), + tpu_util._CoreLocation(0, 3, 1, 0), + tpu_util._CoreLocation(0, 3, 1, 1), + tpu_util._CoreLocation(1, 3, 1, 0), + tpu_util._CoreLocation(1, 3, 1, 1), + tpu_util._CoreLocation(2, 3, 1, 0), + tpu_util._CoreLocation(2, 3, 1, 1), + tpu_util._CoreLocation(3, 3, 1, 0), + tpu_util._CoreLocation(3, 3, 1, 1), + ] + expected = [ + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + 
tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(0, 1, 1, 1), + tpu_util._CoreLocation(0, 1, 1, 0), + tpu_util._CoreLocation(0, 2, 1, 1), + tpu_util._CoreLocation(0, 2, 1, 0), + tpu_util._CoreLocation(0, 3, 1, 1), + tpu_util._CoreLocation(0, 3, 1, 0), + tpu_util._CoreLocation(1, 3, 1, 1), + tpu_util._CoreLocation(1, 3, 1, 0), + tpu_util._CoreLocation(2, 3, 1, 1), + tpu_util._CoreLocation(2, 3, 1, 0), + tpu_util._CoreLocation(3, 3, 1, 1), + tpu_util._CoreLocation(3, 3, 1, 0), + tpu_util._CoreLocation(3, 2, 1, 1), + tpu_util._CoreLocation(3, 2, 1, 0), + tpu_util._CoreLocation(2, 2, 1, 1), + tpu_util._CoreLocation(2, 2, 1, 0), + tpu_util._CoreLocation(1, 2, 1, 1), + tpu_util._CoreLocation(1, 2, 1, 0), + tpu_util._CoreLocation(1, 1, 1, 1), + tpu_util._CoreLocation(1, 1, 1, 0), + tpu_util._CoreLocation(2, 1, 1, 1), + tpu_util._CoreLocation(2, 1, 1, 0), + tpu_util._CoreLocation(3, 1, 1, 1), + tpu_util._CoreLocation(3, 1, 1, 0), + tpu_util._CoreLocation(3, 0, 1, 1), + tpu_util._CoreLocation(3, 0, 1, 0), + tpu_util._CoreLocation(2, 0, 1, 1), + tpu_util._CoreLocation(2, 0, 1, 0), + tpu_util._CoreLocation(1, 0, 1, 1), + tpu_util._CoreLocation(1, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 1), + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + ] + result = self._build_all_reduce_ring(core_locations) + self.assertAllEqual(result, expected) + + # Picture of chips: + # 0 -- 1 4 -- 5 + # | | | | + # 3 -- 2 7 -- 6 + # + # 12-- 13 8 -- 9 + # | | | | + # 15-- 14 11-- 10 + def testBuildOrthogonalAllReduceRings(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + ] + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + 
tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + ] + result = tpu_util._build_orthogonal_rings( + core_locations, ring_size=8, rotate_ring_across_rings=False) + self.assertAllEqual(result, expected) + + # Picture of chips: + # 0 -- 1 12 -- 13 + # | | | | + # 3 -- 2 15 -- 14 + # + # 4 -- 5 8 -- 9 + # | | | | + # 7 -- 6 11-- 10 + def testBuildOrthogonalRotatedAllReduceRings(self): + core_locations = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + ] + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 
3, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + ] + result = tpu_util._build_orthogonal_rings( + core_locations, ring_size=8, rotate_ring_across_rings=True) + self.assertAllEqual(result, expected) + + # Create a 4x8 mesh on a 4x4 DF slice, disallowing splitting hosts. + def testCreateDFMeshNoSplittingHosts(self): + result = tpu_util._enumerate_core_locations( + [4, 4, 1, 2], [4, 4, 1, 2], ['core', 'y', 'z', 'x'], + can_split_host_across_rings=False, + ring_size=8) + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + ] + self.assertAllEqual(result, expected) + + # Create a 4x8 mesh on a 4x4 DF slice with at most 2, 2, 1, 2 devices from + # each dimension, disallowing splitting hosts. 
+ def testCreateDFMeshWithRingBoundsNoSplittingHosts(self): + result = tpu_util._enumerate_core_locations( + [4, 4, 1, 2], [2, 2, 1, 2], ['core', 'x', 'y', 'z'], + can_split_host_across_rings=False, + ring_size=8) + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + ] + self.assertAllEqual(result, expected) + + # Create a 4x8 mesh on a 4x4 DF slice, allowing splitting hosts. + def testCreateDFMeshSplittingHosts(self): + result = tpu_util._enumerate_core_locations( + [4, 4, 1, 2], [4, 4, 1, 2], ['core', 'y', 'z', 'x'], + can_split_host_across_rings=True, + ring_size=8) + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + ] + self.assertAllEqual(result, expected) + + # Create a 2x64 mesh on a 4x4x4 PF slice, allowing splitting hosts. 
+ def testCreateMeshPFSplittingHosts(self): + result = tpu_util._enumerate_core_locations( + [4, 4, 4, 2], [4, 4, 4, 2], ['core', 'x', 'y', 'z'], + can_split_host_across_rings=True, + ring_size=64) + expected = [ + tpu_util._CoreLocation(0, 0, 0, 0), + tpu_util._CoreLocation(0, 0, 0, 1), + tpu_util._CoreLocation(1, 0, 0, 0), + tpu_util._CoreLocation(1, 0, 0, 1), + tpu_util._CoreLocation(2, 0, 0, 0), + tpu_util._CoreLocation(2, 0, 0, 1), + tpu_util._CoreLocation(3, 0, 0, 0), + tpu_util._CoreLocation(3, 0, 0, 1), + tpu_util._CoreLocation(0, 1, 0, 0), + tpu_util._CoreLocation(0, 1, 0, 1), + tpu_util._CoreLocation(1, 1, 0, 0), + tpu_util._CoreLocation(1, 1, 0, 1), + tpu_util._CoreLocation(2, 1, 0, 0), + tpu_util._CoreLocation(2, 1, 0, 1), + tpu_util._CoreLocation(3, 1, 0, 0), + tpu_util._CoreLocation(3, 1, 0, 1), + tpu_util._CoreLocation(0, 2, 0, 0), + tpu_util._CoreLocation(0, 2, 0, 1), + tpu_util._CoreLocation(1, 2, 0, 0), + tpu_util._CoreLocation(1, 2, 0, 1), + tpu_util._CoreLocation(2, 2, 0, 0), + tpu_util._CoreLocation(2, 2, 0, 1), + tpu_util._CoreLocation(3, 2, 0, 0), + tpu_util._CoreLocation(3, 2, 0, 1), + tpu_util._CoreLocation(0, 3, 0, 0), + tpu_util._CoreLocation(0, 3, 0, 1), + tpu_util._CoreLocation(1, 3, 0, 0), + tpu_util._CoreLocation(1, 3, 0, 1), + tpu_util._CoreLocation(2, 3, 0, 0), + tpu_util._CoreLocation(2, 3, 0, 1), + tpu_util._CoreLocation(3, 3, 0, 0), + tpu_util._CoreLocation(3, 3, 0, 1), + tpu_util._CoreLocation(0, 0, 1, 0), + tpu_util._CoreLocation(0, 0, 1, 1), + tpu_util._CoreLocation(1, 0, 1, 0), + tpu_util._CoreLocation(1, 0, 1, 1), + tpu_util._CoreLocation(2, 0, 1, 0), + tpu_util._CoreLocation(2, 0, 1, 1), + tpu_util._CoreLocation(3, 0, 1, 0), + tpu_util._CoreLocation(3, 0, 1, 1), + tpu_util._CoreLocation(0, 1, 1, 0), + tpu_util._CoreLocation(0, 1, 1, 1), + tpu_util._CoreLocation(1, 1, 1, 0), + tpu_util._CoreLocation(1, 1, 1, 1), + tpu_util._CoreLocation(2, 1, 1, 0), + tpu_util._CoreLocation(2, 1, 1, 1), + tpu_util._CoreLocation(3, 1, 1, 0), + tpu_util._CoreLocation(3, 1, 1, 1), + tpu_util._CoreLocation(0, 2, 1, 0), + tpu_util._CoreLocation(0, 2, 1, 1), + tpu_util._CoreLocation(1, 2, 1, 0), + tpu_util._CoreLocation(1, 2, 1, 1), + tpu_util._CoreLocation(2, 2, 1, 0), + tpu_util._CoreLocation(2, 2, 1, 1), + tpu_util._CoreLocation(3, 2, 1, 0), + tpu_util._CoreLocation(3, 2, 1, 1), + tpu_util._CoreLocation(0, 3, 1, 0), + tpu_util._CoreLocation(0, 3, 1, 1), + tpu_util._CoreLocation(1, 3, 1, 0), + tpu_util._CoreLocation(1, 3, 1, 1), + tpu_util._CoreLocation(2, 3, 1, 0), + tpu_util._CoreLocation(2, 3, 1, 1), + tpu_util._CoreLocation(3, 3, 1, 0), + tpu_util._CoreLocation(3, 3, 1, 1), + tpu_util._CoreLocation(0, 0, 2, 0), + tpu_util._CoreLocation(0, 0, 2, 1), + tpu_util._CoreLocation(1, 0, 2, 0), + tpu_util._CoreLocation(1, 0, 2, 1), + tpu_util._CoreLocation(2, 0, 2, 0), + tpu_util._CoreLocation(2, 0, 2, 1), + tpu_util._CoreLocation(3, 0, 2, 0), + tpu_util._CoreLocation(3, 0, 2, 1), + tpu_util._CoreLocation(0, 1, 2, 0), + tpu_util._CoreLocation(0, 1, 2, 1), + tpu_util._CoreLocation(1, 1, 2, 0), + tpu_util._CoreLocation(1, 1, 2, 1), + tpu_util._CoreLocation(2, 1, 2, 0), + tpu_util._CoreLocation(2, 1, 2, 1), + tpu_util._CoreLocation(3, 1, 2, 0), + tpu_util._CoreLocation(3, 1, 2, 1), + tpu_util._CoreLocation(0, 2, 2, 0), + tpu_util._CoreLocation(0, 2, 2, 1), + tpu_util._CoreLocation(1, 2, 2, 0), + tpu_util._CoreLocation(1, 2, 2, 1), + tpu_util._CoreLocation(2, 2, 2, 0), + tpu_util._CoreLocation(2, 2, 2, 1), + tpu_util._CoreLocation(3, 2, 2, 0), + tpu_util._CoreLocation(3, 2, 2, 
1), + tpu_util._CoreLocation(0, 3, 2, 0), + tpu_util._CoreLocation(0, 3, 2, 1), + tpu_util._CoreLocation(1, 3, 2, 0), + tpu_util._CoreLocation(1, 3, 2, 1), + tpu_util._CoreLocation(2, 3, 2, 0), + tpu_util._CoreLocation(2, 3, 2, 1), + tpu_util._CoreLocation(3, 3, 2, 0), + tpu_util._CoreLocation(3, 3, 2, 1), + tpu_util._CoreLocation(0, 0, 3, 0), + tpu_util._CoreLocation(0, 0, 3, 1), + tpu_util._CoreLocation(1, 0, 3, 0), + tpu_util._CoreLocation(1, 0, 3, 1), + tpu_util._CoreLocation(2, 0, 3, 0), + tpu_util._CoreLocation(2, 0, 3, 1), + tpu_util._CoreLocation(3, 0, 3, 0), + tpu_util._CoreLocation(3, 0, 3, 1), + tpu_util._CoreLocation(0, 1, 3, 0), + tpu_util._CoreLocation(0, 1, 3, 1), + tpu_util._CoreLocation(1, 1, 3, 0), + tpu_util._CoreLocation(1, 1, 3, 1), + tpu_util._CoreLocation(2, 1, 3, 0), + tpu_util._CoreLocation(2, 1, 3, 1), + tpu_util._CoreLocation(3, 1, 3, 0), + tpu_util._CoreLocation(3, 1, 3, 1), + tpu_util._CoreLocation(0, 2, 3, 0), + tpu_util._CoreLocation(0, 2, 3, 1), + tpu_util._CoreLocation(1, 2, 3, 0), + tpu_util._CoreLocation(1, 2, 3, 1), + tpu_util._CoreLocation(2, 2, 3, 0), + tpu_util._CoreLocation(2, 2, 3, 1), + tpu_util._CoreLocation(3, 2, 3, 0), + tpu_util._CoreLocation(3, 2, 3, 1), + tpu_util._CoreLocation(0, 3, 3, 0), + tpu_util._CoreLocation(0, 3, 3, 1), + tpu_util._CoreLocation(1, 3, 3, 0), + tpu_util._CoreLocation(1, 3, 3, 1), + tpu_util._CoreLocation(2, 3, 3, 0), + tpu_util._CoreLocation(2, 3, 3, 1), + tpu_util._CoreLocation(3, 3, 3, 0), + tpu_util._CoreLocation(3, 3, 3, 1), + ] + self.assertAllEqual(result, expected) + + def testCreateMeshNoSplittingHostsUnfulfillable(self): + with self.assertRaises(ValueError): + tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_unfulfillable_without_splitting_hosts', + can_split_host_across_rings=False) + + def testCreateMeshWithDefaultOptions(self): + mesh = tpu_util.create_tpu_mesh(['x'], [2], 'mesh_with_default_options') + self.assertAllEqual(mesh.shape(), [2]) + self.assertEqual(mesh.num_local_devices(), 2) + + def testCreateMeshWithWrongShape(self): + with self.assertRaises(ValueError): + tpu_util.create_tpu_mesh(['x'], [1], 'mesh_with_wrong_shape') + + # Build rings for the batch dimension. + def testCreateMeshWithPositiveRingDims(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_with_positive_ring_dims', + ring_dims=1) + self.assertAllEqual(mesh.shape(), [2, 1]) + self.assertEqual(mesh.num_local_devices(), 2) + + # Build rings for all non-batch dimensions. + def testCreateMeshWithNegativeRingDims(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y', 'z'], [1, 2, 1], + 'mesh_with_negative_ring_dims', + ring_dims=-2) + self.assertAllEqual(mesh.shape(), [1, 2, 1]) + self.assertEqual(mesh.num_local_devices(), 2) + + # Build single-core rings. + def testCreateMeshWithZeroRingDims(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_with_zero_ring_dims', + ring_dims=0) + self.assertAllEqual(mesh.shape(), [2, 1]) + self.assertEqual(mesh.num_local_devices(), 2) + + def testCreateMeshWithCustomAxes(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_with_custom_axes', + ring_axes=['x', 'z', 'y', 'core']) + self.assertAllEqual(mesh.shape(), [2, 1]) + self.assertEqual(mesh.num_local_devices(), 2) + + # More cores (2 cores) on the first axis (core) than ring size (1). 
+ def testCreateMeshWithDividedAxis(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_with_divided_axis', + ring_dims=-1, + ring_axes=['core', 'z', 'y', 'x']) + self.assertAllEqual(mesh.shape(), [2, 1]) + self.assertEqual(mesh.num_local_devices(), 2) + + # Both meshes should produce the same result despite different `ring_dim`. + def testCreateMultipleMeshes(self): + a = constant_op.constant([[0, 1], [2, 3]], dtype=dtypes.int32) + b_expected = math_ops.reduce_sum(a) + + mesh_1 = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], 'mesh_1', ring_dims=1) + a_1 = numpy_util.pack_numpy(a, Layout(['x', 'y'], mesh_1)) + b_1 = math_ops.reduce_sum(a_1) + self.assertDTensorEqual(b_expected, Layout.replicated(mesh_1, rank=0), b_1) + + mesh_2 = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_2', + ring_dims=-1) + a_2 = numpy_util.pack_numpy(a, Layout(['x', 'y'], mesh_2)) + b_2 = math_ops.reduce_sum(a_2) + self.assertDTensorEqual(b_expected, Layout.replicated(mesh_2, rank=0), b_2) + + def testCreateMeshWithEmptyName(self): + tpu_util.create_tpu_mesh(['x'], [2], '') + + def testCreateMeshWithExistingName(self): + tpu_util.create_tpu_mesh(['x'], [2], 'mesh_with_existing_name') + with self.assertRaises(ValueError): + tpu_util.create_tpu_mesh(['x'], [2], 'mesh_with_existing_name') + + def testGetDeviceIDs(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_to_get_device_ids') + self.assertAllEqual(tpu_util.get_device_ids(mesh), [0, 1]) + + def testGetDeviceLocations(self): + mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], + 'mesh_to_get_device_locations') + self.assertAllEqual( + tpu_util.get_device_locations(mesh), [{ + 'x': 0, + 'y': 0 + }, { + 'x': 1, + 'y': 0 + }]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/dtensor/python/tpu_util.py b/tensorflow/dtensor/python/tpu_util.py index 317d49e37ab59b..a191eac8a6532c 100644 --- a/tensorflow/dtensor/python/tpu_util.py +++ b/tensorflow/dtensor/python/tpu_util.py @@ -142,7 +142,8 @@ def _shutdown_tpu_system(): def tpu_system_init_helper(task_id, num_tasks, num_devices, - use_tfrt_host_runtime=True): + use_tfrt_host_runtime=True, + use_megacore=False): """A helper function to initialize multi-client tpu system.""" @def_function.function @@ -156,6 +157,10 @@ def _set_global_tpu_array_fn(topology_proto): with ops.device("/job:" + config.full_job_name() + "/device:TPU_SYSTEM:0"): # pylint: disable=protected-access my_core_ids = _tpu_init_fn() + + if use_megacore: + logging.info("Using TPU megacore") + my_core_ids = my_core_ids * 2 logging.info("TPU core IDs: %s", my_core_ids) # `my_core_ids` contains the IDs of TPU cores attached to this host. 
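The hunk above threads a new `use_megacore` flag through `tpu_system_init_helper` (when set, the reported core IDs are multiplied by two), and the following hunk exposes the same flag on `initialize_tpu_system`. A minimal usage sketch of the APIs exercised by the tests earlier in this change, assuming a DTensor-enabled multi-client TPU job with two local TPU cores (the function and parameter names come from this patch; the surrounding setup is illustrative only, not an official example):

# Illustrative sketch only; requires a TPU runtime with DTensor support.
from tensorflow.dtensor.python import tpu_util

# Initialize the TPU system. Passing use_megacore=True exercises the new
# code path above, which scales the reported core IDs by two.
tpu_util.initialize_tpu_system(use_megacore=False)

# Build a 2x1 mesh with rings on the leading ('x') dimension, mirroring
# testCreateMeshWithPositiveRingDims.
mesh = tpu_util.create_tpu_mesh(['x', 'y'], [2, 1], 'example_mesh',
                                ring_dims=1)
print(mesh.shape())              # [2, 1]
print(mesh.num_local_devices())  # 2 on a single two-core host

Per the test comments above, a positive `ring_dims` builds rings over the leading mesh dimensions, while a negative value builds them over the trailing (non-batch) dimensions.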
@@ -240,7 +245,7 @@ def _set_global_tpu_array_fn(topology_proto): return tpu_topology, device -def initialize_tpu_system(): +def initialize_tpu_system(use_megacore=False): """Initializes the TPU system.""" # Make sure the server change is fully propagated before attempting to run @@ -260,7 +265,8 @@ def initialize_tpu_system(): task_id, num_tasks, num_devices, - use_tfrt_host_runtime=use_tfrt_host_runtime) + use_tfrt_host_runtime=use_tfrt_host_runtime, + use_megacore=use_megacore) global _tpu_topology _tpu_topology = tpu_topology logging.vlog(1, "TPU Topology: %s, %s", tpu_topology.mesh_shape, diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index dffdc7d1c386ac..1c1bc5a47fac85 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -2515,6 +2515,22 @@ func BatchMatMulAdjY(value bool) BatchMatMulAttr { } } +// BatchMatMulGradX sets the optional grad_x attribute to value. +// If not specified, defaults to false +func BatchMatMulGradX(value bool) BatchMatMulAttr { + return func(m optionalAttr) { + m["grad_x"] = value + } +} + +// BatchMatMulGradY sets the optional grad_y attribute to value. +// If not specified, defaults to false +func BatchMatMulGradY(value bool) BatchMatMulAttr { + return func(m optionalAttr) { + m["grad_y"] = value + } +} + // Multiplies slices of two tensors in batches. // // Multiplies all slices of `Tensor` `x` and `y` (each slice can be @@ -2584,6 +2600,22 @@ func BatchMatMulV2AdjY(value bool) BatchMatMulV2Attr { } } +// BatchMatMulV2GradX sets the optional grad_x attribute to value. +// If not specified, defaults to false +func BatchMatMulV2GradX(value bool) BatchMatMulV2Attr { + return func(m optionalAttr) { + m["grad_x"] = value + } +} + +// BatchMatMulV2GradY sets the optional grad_y attribute to value. +// If not specified, defaults to false +func BatchMatMulV2GradY(value bool) BatchMatMulV2Attr { + return func(m optionalAttr) { + m["grad_y"] = value + } +} + // Multiplies slices of two tensors in batches. // // Multiplies all slices of `Tensor` `x` and `y` (each slice can be @@ -2657,6 +2689,22 @@ func BatchMatMulV3AdjY(value bool) BatchMatMulV3Attr { } } +// BatchMatMulV3GradX sets the optional grad_x attribute to value. +// If not specified, defaults to false +func BatchMatMulV3GradX(value bool) BatchMatMulV3Attr { + return func(m optionalAttr) { + m["grad_x"] = value + } +} + +// BatchMatMulV3GradY sets the optional grad_y attribute to value. +// If not specified, defaults to false +func BatchMatMulV3GradY(value bool) BatchMatMulV3Attr { + return func(m optionalAttr) { + m["grad_y"] = value + } +} + // Multiplies slices of two tensors in batches. // // Multiplies all slices of `Tensor` `x` and `y` (each slice can be @@ -24661,6 +24709,22 @@ func MatMulTransposeB(value bool) MatMulAttr { } } +// MatMulGradA sets the optional grad_a attribute to value. +// If not specified, defaults to false +func MatMulGradA(value bool) MatMulAttr { + return func(m optionalAttr) { + m["grad_a"] = value + } +} + +// MatMulGradB sets the optional grad_b attribute to value. +// If not specified, defaults to false +func MatMulGradB(value bool) MatMulAttr { + return func(m optionalAttr) { + m["grad_b"] = value + } +} + // Multiply the matrix "a" by the matrix "b". 
// // The inputs must be two-dimensional matrices and the inner dimension of diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 4c42cccbbb82b2..d1d3d6bb123958 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -220,7 +220,6 @@ cc_library( copts = tflite_copts_warnings(), deps = [ ":graph_info", - ":kernel_api", ":memory_planner", ":simple_memory_arena", ":util", @@ -237,7 +236,6 @@ cc_library( copts = tflite_copts_warnings() + ["-DTF_LITE_TENSORFLOW_PROFILER"], deps = [ ":graph_info", - ":kernel_api", ":memory_planner", ":simple_memory_arena_with_profiler", ":util", @@ -256,9 +254,10 @@ cc_test( ":arena_planner_with_profiler", ":builtin_ops", ":graph_info", - "//tensorflow/core:tflite_portable_logging", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/core/c:common", - "//tensorflow/lite/testing:util", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", "@com_google_googletest//:gtest_main", ], ) @@ -1053,7 +1052,6 @@ cc_test( deps = [ ":simple_memory_arena", "//tensorflow/lite/core/c:common", - "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest_main", ], ) diff --git a/tensorflow/lite/acceleration/configuration/BUILD b/tensorflow/lite/acceleration/configuration/BUILD index 4f1b2fe568cb97..6b6aab6638d033 100644 --- a/tensorflow/lite/acceleration/configuration/BUILD +++ b/tensorflow/lite/acceleration/configuration/BUILD @@ -277,11 +277,8 @@ cc_library( "//conditions:default": [], }), visibility = [ - "//tensorflow/lite/acceleration/configuration/c:__pkg__", "//tensorflow/lite/core/acceleration/configuration/c:__pkg__", - "//tensorflow/lite/core/experimental/acceleration/configuration/c:__pkg__", "//tensorflow/lite/experimental/acceleration/configuration:__pkg__", - "//tensorflow/lite/experimental/acceleration/configuration/c:__pkg__", ], deps = [ ":configuration_fbs", diff --git a/tensorflow/lite/acceleration/configuration/c/delegate_plugin.h b/tensorflow/lite/acceleration/configuration/c/delegate_plugin.h index 3c186b0345b741..46c20dbfb40f93 100644 --- a/tensorflow/lite/acceleration/configuration/c/delegate_plugin.h +++ b/tensorflow/lite/acceleration/configuration/c/delegate_plugin.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_DELEGATE_PLUGIN_H_ #define TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_DELEGATE_PLUGIN_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h + #include "tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h" #endif // TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_DELEGATE_PLUGIN_H_ diff --git a/tensorflow/lite/acceleration/configuration/c/gpu_plugin.h b/tensorflow/lite/acceleration/configuration/c/gpu_plugin.h index 8a8202ef2bd1f6..6c83d2b2bded37 100644 --- a/tensorflow/lite/acceleration/configuration/c/gpu_plugin.h +++ b/tensorflow/lite/acceleration/configuration/c/gpu_plugin.h @@ -15,6 +15,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_GPU_PLUGIN_H_ #define TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_GPU_PLUGIN_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/acceleration/configuration/c/gpu_plugin.h + #include "tensorflow/lite/core/acceleration/configuration/c/gpu_plugin.h" #endif // TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_GPU_PLUGIN_H_ diff --git a/tensorflow/lite/acceleration/configuration/c/nnapi_plugin.h b/tensorflow/lite/acceleration/configuration/c/nnapi_plugin.h index 74be5f9b3a96dc..f2406e8311860b 100644 --- a/tensorflow/lite/acceleration/configuration/c/nnapi_plugin.h +++ b/tensorflow/lite/acceleration/configuration/c/nnapi_plugin.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_NNAPI_PLUGIN_H_ #define TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_NNAPI_PLUGIN_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/acceleration/configuration/c/nnapi_plugin.h + #include "tensorflow/lite/core/acceleration/configuration/c/nnapi_plugin.h" #endif // TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_NNAPI_PLUGIN_H_ diff --git a/tensorflow/lite/acceleration/configuration/c/stable_delegate.h b/tensorflow/lite/acceleration/configuration/c/stable_delegate.h index 2b34a32f4bf611..f3589c58cc9562 100644 --- a/tensorflow/lite/acceleration/configuration/c/stable_delegate.h +++ b/tensorflow/lite/acceleration/configuration/c/stable_delegate.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_STABLE_DELEGATE_H_ #define TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_STABLE_DELEGATE_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/acceleration/configuration/c/stable_delegate.h + #include "tensorflow/lite/core/acceleration/configuration/c/stable_delegate.h" #endif // TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_STABLE_DELEGATE_H_ diff --git a/tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h b/tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h index 9ced18f3dc5a86..ae44009e4b816e 100644 --- a/tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h +++ b/tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_XNNPACK_PLUGIN_H_ #define TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_XNNPACK_PLUGIN_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/acceleration/configuration/c/xnnpack_plugin.h + #include "tensorflow/lite/core/acceleration/configuration/c/xnnpack_plugin.h" #endif // TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_C_XNNPACK_PLUGIN_H_ diff --git a/tensorflow/lite/acceleration/configuration/delegate_registry.h b/tensorflow/lite/acceleration/configuration/delegate_registry.h index b1064054f30d25..a6ed2b0636b937 100644 --- a/tensorflow/lite/acceleration/configuration/delegate_registry.h +++ b/tensorflow/lite/acceleration/configuration/delegate_registry.h @@ -15,7 +15,10 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ #define TENSORFLOW_LITE_ACCELERATION_CONFIGURATION_DELEGATE_REGISTRY_H_ -#include "tensorflow/lite/core/acceleration/configuration/delegate_registry.h" +/// For documentation, see +/// third_party/tensorflow/lite/core/acceleration/configuration/delegate_registry.h + +#include "tensorflow/lite/core/acceleration/configuration/delegate_registry.h" // IWYU pragma: export namespace tflite { namespace delegates { diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index b63c682d7cb046..8fd1a794369b50 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -23,7 +23,6 @@ limitations under the License. #include #include -#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/graph_info.h" #include "tensorflow/lite/simple_memory_arena.h" @@ -42,6 +41,7 @@ ArenaPlanner::ArenaPlanner(TfLiteContext* context, : context_(context), graph_info_(std::move(graph_info)), arena_(kDefaultArenaAlignment, subgraph_index), + has_nonpersistent_memory_(false), persistent_arena_(kDefaultArenaAlignment, subgraph_index), preserve_all_tensors_(preserve_all_tensors), tensor_alignment_(tensor_alignment), @@ -380,6 +380,7 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { TfLiteStatus ArenaPlanner::ReleaseNonPersistentMemory() { // Clear non-persistent arena's buffer. TF_LITE_ENSURE_STATUS(arena_.ReleaseBuffer()); + has_nonpersistent_memory_ = false; // Set data pointers for all non-persistent tensors to nullptr. TfLiteTensor* tensors = graph_info_->tensors(); for (int i = 0; i < static_cast(graph_info_->num_tensors()); ++i) { @@ -394,7 +395,8 @@ TfLiteStatus ArenaPlanner::ReleaseNonPersistentMemory() { TfLiteStatus ArenaPlanner::AcquireNonPersistentMemory() { // First commit arena_ to allocate underlying buffer. bool reallocated; - TF_LITE_ENSURE_STATUS(arena_.Commit(context_, &reallocated)); + TF_LITE_ENSURE_STATUS(arena_.Commit(&reallocated)); + has_nonpersistent_memory_ = true; // Resolve allocations for all tensors not on the persistent arena. TfLiteTensor* tensors = graph_info_->tensors(); for (int i = 0; i < static_cast(graph_info_->num_tensors()); ++i) { @@ -407,7 +409,7 @@ TfLiteStatus ArenaPlanner::AcquireNonPersistentMemory() { } bool ArenaPlanner::HasNonPersistentMemory() { - return arena_.GetBufferSize() != 0; + return has_nonpersistent_memory_; } void ArenaPlanner::DumpDebugInfo(const std::vector& execution_plan) const { @@ -424,9 +426,10 @@ void ArenaPlanner::GetAllocInfo(size_t* arena_size, TfLiteStatus ArenaPlanner::Commit(bool* reallocated) { bool arena_reallocated, persistent_arena_reallocated; - TF_LITE_ENSURE_STATUS(arena_.Commit(context_, &arena_reallocated)); + TF_LITE_ENSURE_STATUS(arena_.Commit(&arena_reallocated)); + has_nonpersistent_memory_ = true; TF_LITE_ENSURE_STATUS( - persistent_arena_.Commit(context_, &persistent_arena_reallocated)); + persistent_arena_.Commit(&persistent_arena_reallocated)); *reallocated = arena_reallocated; *reallocated |= persistent_arena_reallocated; return kTfLiteOk; diff --git a/tensorflow/lite/arena_planner.h b/tensorflow/lite/arena_planner.h index f8547c352a8fc5..f4644d15986fab 100644 --- a/tensorflow/lite/arena_planner.h +++ b/tensorflow/lite/arena_planner.h @@ -15,6 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_ARENA_PLANNER_H_ #define TENSORFLOW_LITE_ARENA_PLANNER_H_ +#include #include #include #include @@ -30,7 +31,6 @@ limitations under the License. namespace tflite { constexpr const int kDefaultArenaAlignment = 64; -struct AllocationInfo; // A memory planner that makes all the allocations using arenas. // @@ -141,6 +141,8 @@ class ArenaPlanner : public MemoryPlanner { // Raw memory buffer that is allocated for all temporary and graph outputs // that are declared kTfLiteArenaRw. SimpleMemoryArena arena_; + // True when the arena_ has allocated memory (Commit was called). + bool has_nonpersistent_memory_; // Raw memory buffer that is allocated for persistent tensors that are // declared as kTfLiteArenaRwPersistent. diff --git a/tensorflow/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc index 2a434d734f0ec9..2021ac0797654c 100644 --- a/tensorflow/lite/arena_planner_test.cc +++ b/tensorflow/lite/arena_planner_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -25,11 +26,12 @@ limitations under the License. #include #include -#include "tensorflow/core/platform/logging.h" +#include "absl/log/check.h" +#include "absl/log/log.h" #include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/graph_info.h" -#include "tensorflow/lite/testing/util.h" namespace tflite { @@ -1079,10 +1081,10 @@ TEST_F(ArenaPlannerTest, SimpleProfilerTest) { SetGraph(&graph); Execute(0, graph.nodes().size() - 1); - EXPECT_EQ(gNumAlloc, 2); + EXPECT_EQ(gNumAlloc, 1); EXPECT_EQ(gNumDealloc, 0); Destroy(); - EXPECT_EQ(gNumDealloc, 2); + EXPECT_EQ(gNumDealloc, 1); } } // namespace diff --git a/tensorflow/lite/async/c/async_kernel.h b/tensorflow/lite/async/c/async_kernel.h index f9bc9dcd5866d9..d49c72f4a5342b 100644 --- a/tensorflow/lite/async/c/async_kernel.h +++ b/tensorflow/lite/async/c/async_kernel.h @@ -14,6 +14,7 @@ limitations under the License. /// For documentation, see /// third_party/tensorflow/lite/core/async/c/async_kernel.h. + #include "tensorflow/lite/core/async/c/async_kernel.h" // IWYU pragma: export #endif // TENSORFLOW_LITE_ASYNC_C_ASYNC_KERNEL_H_ diff --git a/tensorflow/lite/async/c/async_signature_runner.h b/tensorflow/lite/async/c/async_signature_runner.h index 84ea7085cc6ced..7eacd0cb8ebfc1 100644 --- a/tensorflow/lite/async/c/async_signature_runner.h +++ b/tensorflow/lite/async/c/async_signature_runner.h @@ -14,6 +14,7 @@ limitations under the License. /// For documentation, see /// third_party/tensorflow/lite/core/async/c/async_signature_runner.h. + #include "tensorflow/lite/core/async/c/async_signature_runner.h" // IWYU pragma: export #endif // TENSORFLOW_LITE_ASYNC_C_ASYNC_SIGNATURE_RUNNER_H_ diff --git a/tensorflow/lite/async/c/task.h b/tensorflow/lite/async/c/task.h index 0fa56b3358302d..891e4183f4514e 100644 --- a/tensorflow/lite/async/c/task.h +++ b/tensorflow/lite/async/c/task.h @@ -12,8 +12,10 @@ limitations under the License. ==============================================================================*/ #ifndef TENSORFLOW_LITE_ASYNC_C_TASK_H_ #define TENSORFLOW_LITE_ASYNC_C_TASK_H_ + /// For documentation, see /// third_party/tensorflow/lite/core/async/c/task.h. 
+ #include "tensorflow/lite/core/async/c/task.h" // IWYU pragma: export #endif // TENSORFLOW_LITE_ASYNC_C_TASK_H_ diff --git a/tensorflow/lite/async/c/types.h b/tensorflow/lite/async/c/types.h index a606c75536b5b1..6b509427111de3 100644 --- a/tensorflow/lite/async/c/types.h +++ b/tensorflow/lite/async/c/types.h @@ -11,7 +11,10 @@ limitations under the License. ==============================================================================*/ #ifndef TENSORFLOW_LITE_ASYNC_C_TYPES_H_ #define TENSORFLOW_LITE_ASYNC_C_TYPES_H_ + /// For documentation, see /// tensorflow/lite/core/async/c/types.h. + #include "tensorflow/lite/core/async/c/types.h" // IWYU pragma: export + #endif // TENSORFLOW_LITE_ASYNC_C_TYPES_H_ diff --git a/tensorflow/lite/async/interop/c/attribute_map.h b/tensorflow/lite/async/interop/c/attribute_map.h index c1b41b6292ccfc..7da44462e99a30 100644 --- a/tensorflow/lite/async/interop/c/attribute_map.h +++ b/tensorflow/lite/async/interop/c/attribute_map.h @@ -17,4 +17,4 @@ limitations under the License. #include "tensorflow/lite/core/async/interop/c/attribute_map.h" // IWYU pragma: export -#endif // TENSORFLOW_LITE_ASYNC_INTEROP_C_ATTRIBUTE_MAP_H_ \ No newline at end of file +#endif // TENSORFLOW_LITE_ASYNC_INTEROP_C_ATTRIBUTE_MAP_H_ diff --git a/tensorflow/lite/async/interop/c/constants.h b/tensorflow/lite/async/interop/c/constants.h index 07365bf9f41dd1..6b151dde5fd3bd 100644 --- a/tensorflow/lite/async/interop/c/constants.h +++ b/tensorflow/lite/async/interop/c/constants.h @@ -12,6 +12,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ASYNC_INTEROP_C_CONSTANTS_H_ #define TENSORFLOW_LITE_ASYNC_INTEROP_C_CONSTANTS_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/async/interop/c/constants.h + #include "tensorflow/lite/core/async/interop/c/constants.h" // IWYU pragma: export #endif // TENSORFLOW_LITE_ASYNC_INTEROP_C_CONSTANTS_H_ diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h index 7628e5ad1f9997..0606819288b6e5 100644 --- a/tensorflow/lite/c/builtin_op_data.h +++ b/tensorflow/lite/c/builtin_op_data.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_C_BUILTIN_OP_DATA_H_ #define TENSORFLOW_LITE_C_BUILTIN_OP_DATA_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/c/builtin_op_data.h + #include "tensorflow/lite/core/c/builtin_op_data.h" #endif // TENSORFLOW_LITE_C_BUILTIN_OP_DATA_H_ diff --git a/tensorflow/lite/c/c_api_experimental.h b/tensorflow/lite/c/c_api_experimental.h index 2bf6add77f3c02..84cd4b030506af 100644 --- a/tensorflow/lite/c/c_api_experimental.h +++ b/tensorflow/lite/c/c_api_experimental.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_C_C_API_EXPERIMENTAL_H_ #define TENSORFLOW_LITE_C_C_API_EXPERIMENTAL_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/c/c_api_experimental.h + #include "tensorflow/lite/core/c/c_api_experimental.h" #endif // TENSORFLOW_LITE_C_C_API_EXPERIMENTAL_H_ diff --git a/tensorflow/lite/c/c_api_opaque.h b/tensorflow/lite/c/c_api_opaque.h index 0cafb763f83cdf..7e4d401a46466e 100644 --- a/tensorflow/lite/c/c_api_opaque.h +++ b/tensorflow/lite/c/c_api_opaque.h @@ -15,6 +15,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_C_C_API_OPAQUE_H_ #define TENSORFLOW_LITE_C_C_API_OPAQUE_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/c/c_api_opaque.h + #include "tensorflow/lite/core/c/c_api_opaque.h" #endif // TENSORFLOW_LITE_C_C_API_OPAQUE_H_ diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index f5a31d3f0cd88c..8a8b51331c476b 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -21,6 +21,9 @@ limitations under the License. /// interpreter and the operations are C. /// /// For documentation, see tensorflow/lite/core/c/common.h. +/// +/// See also c_api_opaque.h which has more ABI-stable variants of some of these +/// APIs. #ifndef TENSORFLOW_LITE_C_COMMON_H_ #define TENSORFLOW_LITE_C_COMMON_H_ diff --git a/tensorflow/lite/c/jni/jni_utils.h b/tensorflow/lite/c/jni/jni_utils.h index a425dcf4788f40..355b7a4a83bbf9 100644 --- a/tensorflow/lite/c/jni/jni_utils.h +++ b/tensorflow/lite/c/jni/jni_utils.h @@ -22,6 +22,14 @@ limitations under the License. extern "C" { #endif +/// Checks whether the TFLite API has been initialized, throwing a Java exception +/// otherwise. +/// +/// @param env The JNIEnv for the current thread (which has to be attached to the +/// JVM). +/// @return Whether or not the TFLite API has been initialized. If this method +/// returns false, no other JNI method should be called until the pending +/// exception has been handled (typically by returning to Java). bool TfLiteCheckInitializedOrThrow(JNIEnv* env); #ifdef __cplusplus diff --git a/tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h b/tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h index 900a2666934186..3f02a3fe267fc1 100644 --- a/tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h +++ b/tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h @@ -12,18 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// NOLINTBEGIN(whitespace/line_length) // WARNING: Users of TensorFlow Lite should not include this file directly, // but should instead include // "third_party/tensorflow/lite/acceleration/configuration/c/delegate_plugin.h". -// Only the TensorFlow Lite implementation itself should include this -// file directly. -// NOLINTEND(whitespace/line_length) +// Only the TensorFlow Lite implementation itself should include this file +// directly. + #ifndef TENSORFLOW_LITE_CORE_ACCELERATION_CONFIGURATION_C_DELEGATE_PLUGIN_H_ #define TENSORFLOW_LITE_CORE_ACCELERATION_CONFIGURATION_C_DELEGATE_PLUGIN_H_ /// C API types for TF Lite delegate plugins. +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/acceleration/configuration/c/delegate_plugin.h" +/// \endcode +/// to access the APIs documented on this page. 
+// NOLINTEND(whitespace/line_length) +// clang-format on + #include "tensorflow/lite/core/c/common.h" #ifdef __cplusplus @@ -32,7 +41,7 @@ extern "C" { // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup delegate_plugin tensorflow/lite/acceleration/configuration/c/delegate_plugin.h +/** \defgroup delegate_plugin lite/acceleration/configuration/c/delegate_plugin.h * @{ */ // NOLINTEND(whitespace/line_length) diff --git a/tensorflow/lite/core/acceleration/configuration/c/gpu_plugin.h b/tensorflow/lite/core/acceleration/configuration/c/gpu_plugin.h index c30ce4dcdf4452..c1e42c935f974a 100644 --- a/tensorflow/lite/core/acceleration/configuration/c/gpu_plugin.h +++ b/tensorflow/lite/core/acceleration/configuration/c/gpu_plugin.h @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// NOLINTBEGIN(whitespace/line_length) -// WARNING: Users of TensorFlow Lite should not include this file directly, -// but should instead include +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include // "third_party/tensorflow/lite/acceleration/configuration/c/gpu_plugin.h". -// Only the TensorFlow Lite implementation itself should include this -// file directly. -// NOLINTEND(whitespace/line_length) +// Only the TensorFlow Lite implementation itself should include this file +// directly. + #ifndef TENSORFLOW_LITE_CORE_ACCELERATION_CONFIGURATION_C_GPU_PLUGIN_H_ #define TENSORFLOW_LITE_CORE_ACCELERATION_CONFIGURATION_C_GPU_PLUGIN_H_ @@ -32,6 +31,16 @@ limitations under the License. /// /// But to provide a C API to access the GPU delegate plugin, we do expose /// some functions, which are declared below. +/// +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/acceleration/configuration/c/gpu_plugin.h" +/// \endcode +/// to access the APIs documented on this page. +// NOLINTEND(whitespace/line_length) +// clang-format on #include "tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h" @@ -41,7 +50,7 @@ extern "C" { // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup gpu_plugin tensorflow/lite/acceleration/configuration/c/gpu_plugin.h +/** \defgroup gpu_plugin lite/acceleration/configuration/c/gpu_plugin.h * @{ */ // NOLINTEND(whitespace/line_length) diff --git a/tensorflow/lite/core/acceleration/configuration/c/xnnpack_plugin.h b/tensorflow/lite/core/acceleration/configuration/c/xnnpack_plugin.h index fce48ff8622288..d7c51a9b5afc7a 100644 --- a/tensorflow/lite/core/acceleration/configuration/c/xnnpack_plugin.h +++ b/tensorflow/lite/core/acceleration/configuration/c/xnnpack_plugin.h @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// NOLINTBEGIN(whitespace/line_length) -// WARNING: Users of TensorFlow Lite should not include this file directly, -// but should instead include +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include // "third_party/tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h". 
-// Only the TensorFlow Lite implementation itself should include this -// file directly. -// NOLINTEND(whitespace/line_length) +// Only the TensorFlow Lite implementation itself should include this file +// directly. + #ifndef TENSORFLOW_LITE_CORE_ACCELERATION_CONFIGURATION_C_XNNPACK_PLUGIN_H_ #define TENSORFLOW_LITE_CORE_ACCELERATION_CONFIGURATION_C_XNNPACK_PLUGIN_H_ @@ -32,6 +31,16 @@ limitations under the License. /// /// But to provide a C API to access the XNNPACK delegate plugin, we do expose /// some functions, which are declared below. +/// +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h" +/// \endcode +/// to access the APIs documented on this page. +// NOLINTEND(whitespace/line_length) +// clang-format on #include "tensorflow/lite/core/acceleration/configuration/c/delegate_plugin.h" @@ -41,7 +50,7 @@ extern "C" { // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup xnnpack_plugin tensorflow/lite/acceleration/configuration/c/xnnpack_plugin.h +/** \defgroup xnnpack_plugin lite/acceleration/configuration/c/xnnpack_plugin.h * @{ */ // NOLINTEND(whitespace/line_length) diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index f37e38a9c144fd..8b7f0e522acf21 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -918,6 +918,9 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } + case BuiltinOperator_STABLEHLO_PAD: { + return ParseStablehloPad(op, error_reporter, allocator, builtin_data); + } // TODO: skip param parsing for now since ops below don't have kernels case BuiltinOperator_STABLEHLO_SLICE: case BuiltinOperator_STABLEHLO_BROADCAST_IN_DIM: @@ -952,7 +955,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_STABLEHLO_IOTA: case BuiltinOperator_STABLEHLO_COMPARE: case BuiltinOperator_STABLEHLO_CONVERT: - case BuiltinOperator_STABLEHLO_PAD: case BuiltinOperator_STABLEHLO_DOT_GENERAL: case BuiltinOperator_STABLEHLO_SORT: case BuiltinOperator_STABLEHLO_WHILE: @@ -2123,7 +2125,8 @@ TfLiteStatus ParseStablehloReduceWindow(const Operator* op, const size_t rank = schema_params->window_dimensions()->size(); auto LoadAttr = [&error_reporter]( - auto& params_array, auto* const flatbuffer_vector, + int64_t* params_array, size_t params_array_size_bytes, + const flatbuffers::Vector* flatbuffer_vector, const char* attr_name, const size_t expected_size, const int64_t fill_value) -> TfLiteStatus { if (flatbuffer_vector && flatbuffer_vector->size()) { @@ -2136,7 +2139,7 @@ TfLiteStatus ParseStablehloReduceWindow(const Operator* op, return kTfLiteError; } TfLiteStatus status = FlatBufferIntVectorToArray( - sizeof(params_array), flatbuffer_vector, params_array, + params_array_size_bytes, flatbuffer_vector, params_array, error_reporter, "stablehlo.reduce_window"); if (status != kTfLiteOk) { TF_LITE_REPORT_ERROR(error_reporter, "Check the '%s' attribute.", @@ -2144,43 +2147,32 @@ TfLiteStatus ParseStablehloReduceWindow(const Operator* op, return status; } } else { - std::fill_n(params_array, - TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT, + std::fill_n(params_array, params_array_size_bytes / sizeof(int64_t), fill_value); } return kTfLiteOk; }; - if (TfLiteStatus 
status = LoadAttr( - params->window_dimensions, schema_params->window_dimensions(), - "window_dimensions", /*expected_size=*/rank, /*fill_value=*/1); - status != kTfLiteOk) { - return status; - } - if (TfLiteStatus status = LoadAttr( - params->window_strides, schema_params->window_strides(), - "window_strides", /*expected_size=*/rank, /*fill_value=*/1); - status != kTfLiteOk) { - return status; - } - if (TfLiteStatus status = LoadAttr( - params->base_dilations, schema_params->base_dilations(), - "base_dilations", /*expected_size=*/rank, /*fill_value=*/1); - status != kTfLiteOk) { - return status; - } - if (TfLiteStatus status = LoadAttr( - params->window_dilations, schema_params->window_dilations(), - "window_dilations", /*expected_size=*/rank, /*fill_value=*/1); - status != kTfLiteOk) { - return status; - } - if (TfLiteStatus status = - LoadAttr(params->padding, schema_params->padding(), "padding", - /*expected_size=*/2 * rank, /*fill_value=*/0); - status != kTfLiteOk) { - return status; - } + TF_LITE_ENSURE_STATUS( + LoadAttr(params->window_dimensions, sizeof(params->window_dimensions), + schema_params->window_dimensions(), "window_dimensions", + /*expected_size=*/rank, /*fill_value=*/1)); + TF_LITE_ENSURE_STATUS( + LoadAttr(params->window_strides, sizeof(params->window_strides), + schema_params->window_strides(), "window_strides", + /*expected_size=*/rank, /*fill_value=*/1)); + TF_LITE_ENSURE_STATUS( + LoadAttr(params->base_dilations, sizeof(params->base_dilations), + schema_params->base_dilations(), "base_dilations", + /*expected_size=*/rank, /*fill_value=*/1)); + TF_LITE_ENSURE_STATUS( + LoadAttr(params->window_dilations, sizeof(params->window_dilations), + schema_params->window_dilations(), "window_dilations", + /*expected_size=*/rank, /*fill_value=*/1)); + TF_LITE_ENSURE_STATUS(LoadAttr(params->padding, sizeof(params->padding), + schema_params->padding(), "padding", + /*expected_size=*/2 * rank, + /*fill_value=*/0)); params->body_subgraph_index = schema_params->body_subgraph_index(); *builtin_data = params.release(); @@ -2209,27 +2201,34 @@ TfLiteStatus ParseStablehloScatter(const Operator* op, if (schema_params) { params->indices_are_sorted = schema_params->indices_are_sorted(); - TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray( - schema_params->update_window_dims()->size() * sizeof(int64_t), - schema_params->update_window_dims(), params->update_window_dims, - error_reporter, "stablehlo_scatter")); - params->num_update_window_dims = - schema_params->update_window_dims()->size(); + if (schema_params->update_window_dims()) { + TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray( + schema_params->update_window_dims()->size() * sizeof(int64_t), + schema_params->update_window_dims(), params->update_window_dims, + error_reporter, "stablehlo_scatter")); + params->num_update_window_dims = + schema_params->update_window_dims()->size(); + } - TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray( - schema_params->inserted_window_dims()->size() * sizeof(int64_t), - schema_params->inserted_window_dims(), params->inserted_window_dims, - error_reporter, "stablehlo_scatter")); - params->num_inserted_window_dims = - schema_params->inserted_window_dims()->size(); + if (schema_params->inserted_window_dims()) { + TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray( + schema_params->inserted_window_dims()->size() * sizeof(int64_t), + schema_params->inserted_window_dims(), params->inserted_window_dims, + error_reporter, "stablehlo_scatter")); + params->num_inserted_window_dims = + 
schema_params->inserted_window_dims()->size(); + } - TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray( - schema_params->scatter_dims_to_operand_dims()->size() * sizeof(int64_t), - schema_params->scatter_dims_to_operand_dims(), - params->scatter_dims_to_operand_dims, error_reporter, - "stablehlo_scatter")); - params->num_scatter_dims_to_operand_dims = - schema_params->scatter_dims_to_operand_dims()->size(); + if (schema_params->scatter_dims_to_operand_dims()) { + TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray( + schema_params->scatter_dims_to_operand_dims()->size() * + sizeof(int64_t), + schema_params->scatter_dims_to_operand_dims(), + params->scatter_dims_to_operand_dims, error_reporter, + "stablehlo_scatter")); + params->num_scatter_dims_to_operand_dims = + schema_params->scatter_dims_to_operand_dims()->size(); + } params->index_vector_dim = schema_params->index_vector_dim(); params->unique_indices = schema_params->unique_indices(); @@ -2326,6 +2325,59 @@ TfLiteStatus ParseStablehloGather(const Operator* op, return kTfLiteOk; } +TfLiteStatus ParseStablehloPad(const Operator* op, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + auto params = safe_allocator.Allocate(); + const StablehloPadOptions* schema_params = + op->builtin_options_2_as_StablehloPadOptions(); + + if (schema_params) { + auto LoadAttr = + [&error_reporter]( + int64_t* params_array, const size_t params_array_size_bytes, + const flatbuffers::Vector* const flatbuffer_vector, + const char* const attr_name) -> TfLiteStatus { + TfLiteStatus status = FlatBufferIntVectorToArray( + params_array_size_bytes, flatbuffer_vector, params_array, + error_reporter, "stablehlo.pad"); + if (status != kTfLiteOk) { + TF_LITE_REPORT_ERROR(error_reporter, "Check the '%s' attribute.", + attr_name); + } + return status; + }; + + TF_LITE_ENSURE_STATUS( + LoadAttr(params->edge_padding_low, sizeof(params->edge_padding_low), + schema_params->edge_padding_low(), "edge_padding_low")); + TF_LITE_ENSURE_STATUS( + LoadAttr(params->edge_padding_high, sizeof(params->edge_padding_high), + schema_params->edge_padding_high(), "edge_padding_high")); + TF_LITE_ENSURE_STATUS( + LoadAttr(params->interior_padding, sizeof(params->interior_padding), + schema_params->interior_padding(), "interior_padding")); + if (schema_params->edge_padding_low()->size() != + schema_params->edge_padding_high()->size() || + schema_params->edge_padding_low()->size() != + schema_params->interior_padding()->size()) { + TF_LITE_REPORT_ERROR(error_reporter, + "'stablehlo.pad' operation parameter array sizes " + "are not consistent."); + return kTfLiteError; + } + *builtin_data = params.release(); + return kTfLiteOk; + } + TF_LITE_REPORT_ERROR(error_reporter, + "Could not get 'stablehlo.pad' operation parameters."); + return kTfLiteError; +} + // We have this parse function instead of directly returning kTfLiteOk from the // switch-case in ParseOpData because this function is used as part of the // selective registration for the OpResolver implementation in micro. 
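The new ParseStablehloPad above copies three equally sized per-dimension arrays (edge_padding_low, edge_padding_high, interior_padding) into TfLiteStablehloPadParams and rejects operators whose array lengths disagree. As a rough illustration of what these parameters encode under StableHLO pad semantics (the helper below is hypothetical and not part of this patch):

# Hedged sketch of StableHLO pad shape arithmetic; padded_shape is a
# hypothetical helper, not code from the TFLite sources touched here.
def padded_shape(input_shape, edge_low, edge_high, interior):
    if not (len(input_shape) == len(edge_low) == len(edge_high) == len(interior)):
        raise ValueError("padding parameter arrays need one entry per dimension")
    out = []
    for dim, lo, hi, inner in zip(input_shape, edge_low, edge_high, interior):
        # `inner` elements are inserted between existing elements, then `lo`
        # and `hi` elements are added at the edges (negative values trim).
        out.append(lo + hi + dim + max(dim - 1, 0) * inner)
    return out

# A 2x3 operand padded by 1 on both edges of dim 0, with one interior
# element between the columns of dim 1, yields a 4x5 result.
print(padded_shape([2, 3], [1, 0], [1, 0], [0, 1]))  # [4, 5]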
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h index 11e70a601077de..1c90e9fd9bdd68 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.h +++ b/tensorflow/lite/core/api/flatbuffer_conversions.h @@ -440,6 +440,11 @@ TfLiteStatus ParseStablehloReduceWindow(const Operator* op, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseStablehloPad(const Operator* op, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + } // namespace tflite #endif // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_ diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc index 1fbe440404607f..6e08e6880e5522 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc @@ -26,7 +26,6 @@ limitations under the License. #include #include "flatbuffers/buffer.h" // from @flatbuffers #include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers -#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/c/builtin_op_data.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -226,8 +225,7 @@ class StablehloReduceWindowFlatbufferConversionsTest auto EmptyAttr() { return builder_.CreateVector({}); } }; -TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindow) { +TEST_F(StablehloReduceWindowFlatbufferConversionsTest, Succeeds) { const Operator* stablehlo_reduce_window_op = BuildTestOperator( BuiltinOptions2_StablehloReduceWindowOptions, CreateStablehloReduceWindowOptions( @@ -260,40 +258,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowDeathTests) { - const Operator* stablehlo_reduce_window_op = BuildTestOperator( - BuiltinOptions2_StablehloReduceWindowOptions, - CreateStablehloReduceWindowOptions( - builder_, /*window_dimensions=*/ValidAttr(), - /*window_strides=*/ValidAttr(), - /*base_dilations=*/ValidAttr(), - /*window_dilations=*/ValidAttr(), - /*padding=*/ValidPaddingAttr(), /*body_subgraph_index=*/13) - .Union()); - TfLiteStablehloReduceWindowParams* output_data = nullptr; -#ifdef NDEBUG - GTEST_SKIP(); -#endif - EXPECT_DEATH( - ParseOpData(nullptr, BuiltinOperator_STABLEHLO_REDUCE_WINDOW, - &mock_reporter_, &mock_allocator_, (void**)&output_data), - ""); - EXPECT_DEATH(ParseOpData(stablehlo_reduce_window_op, - BuiltinOperator_STABLEHLO_REDUCE_WINDOW, nullptr, - &mock_allocator_, (void**)&output_data), - ""); - EXPECT_DEATH(ParseOpData(stablehlo_reduce_window_op, - BuiltinOperator_STABLEHLO_REDUCE_WINDOW, - &mock_reporter_, nullptr, (void**)&output_data), - ""); - EXPECT_DEATH(ParseOpData(stablehlo_reduce_window_op, - BuiltinOperator_STABLEHLO_REDUCE_WINDOW, - &mock_reporter_, &mock_allocator_, nullptr), - ""); -} - -TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWithNoWindowDimensions) { + FailsWithNoWindowDimensions) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -315,7 +280,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithNoWindowStrides) { + SucceedsWithNoWindowStrides) { 
TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -345,7 +310,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithNoBaseDilations) { + SucceedsWithNoBaseDilations) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -375,7 +340,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithNoWindowDilations) { + SucceedsWithNoWindowDilations) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -405,8 +370,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, EXPECT_THAT(output_data->body_subgraph_index, Eq(13)); } -TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithNoPadding) { +TEST_F(StablehloReduceWindowFlatbufferConversionsTest, SucceedsWithNoPadding) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -436,7 +400,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWithEmptyWindowDimensions) { + FailsWithEmptyWindowDimensions) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -458,7 +422,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithEmptyWindowStrides) { + SucceedsWithEmptyWindowStrides) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -488,7 +452,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithEmptyBaseDilations) { + SucceedsWithEmptyBaseDilations) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -518,7 +482,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithEmptyWindowDilations) { + SucceedsWithEmptyWindowDilations) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -549,7 +513,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithEmptyPadding) { + SucceedsWithEmptyPadding) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -579,7 +543,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowSucceedsWithParamsAtMaxDims) { + SucceedsWithParamsAtMaxDims) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( 
BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -599,7 +563,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWhenWindowDimensionsHasMoreThanMaxDims) { + FailsWhenWindowDimensionsHasMoreThanMaxDims) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -622,7 +586,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWhenWindowStridesHasWrongDimCount) { + FailsWhenWindowStridesHasWrongDimCount) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -645,7 +609,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWhenBaseDilationsHasWrongDimCount) { + FailsWhenBaseDilationsHasWrongDimCount) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -668,7 +632,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWhenWindowDilationsHasWrongDimCount) { + FailsWhenWindowDilationsHasWrongDimCount) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -692,7 +656,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, } TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWhenPaddingHasWrongDimCount) { + FailsWhenPaddingHasWrongDimCount) { TfLiteStablehloReduceWindowParams* output_data = nullptr; EXPECT_EQ(ParseOpData( BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, @@ -713,8 +677,7 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, "not have the expected size")); } -TEST_F(StablehloReduceWindowFlatbufferConversionsTest, - ParseStablehloReduceWindowFailsWithWrongOptions) { +TEST_F(StablehloReduceWindowFlatbufferConversionsTest, FailsWithWrongOptions) { const Operator* stablehlo_reduce_window_op = BuildTestOperator(BuiltinOptions2_StablehloReduceWindowOptions, 0); TfLiteStablehloReduceWindowParams* output_data = nullptr; @@ -729,4 +692,179 @@ TEST_F(StablehloReduceWindowFlatbufferConversionsTest, "Could not get 'stablehlo.reduce_window' operation parameters.")); } +TEST_F(StablehloReduceWindowFlatbufferConversionsTest, DeathTests) { + const Operator* stablehlo_reduce_window_op = BuildTestOperator( + BuiltinOptions2_StablehloReduceWindowOptions, + CreateStablehloReduceWindowOptions( + builder_, /*window_dimensions=*/ValidAttr(), + /*window_strides=*/ValidAttr(), + /*base_dilations=*/ValidAttr(), + /*window_dilations=*/ValidAttr(), + /*padding=*/ValidPaddingAttr(), /*body_subgraph_index=*/13) + .Union()); + TfLiteStablehloReduceWindowParams* output_data = nullptr; +#ifdef NDEBUG + GTEST_SKIP(); +#endif + EXPECT_DEATH( + ParseOpData(nullptr, BuiltinOperator_STABLEHLO_REDUCE_WINDOW, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + ""); + EXPECT_DEATH(ParseOpData(stablehlo_reduce_window_op, + BuiltinOperator_STABLEHLO_REDUCE_WINDOW, nullptr, + &mock_allocator_, (void**)&output_data), + ""); + EXPECT_DEATH(ParseOpData(stablehlo_reduce_window_op, + 
BuiltinOperator_STABLEHLO_REDUCE_WINDOW, + &mock_reporter_, nullptr, (void**)&output_data), + ""); + EXPECT_DEATH(ParseOpData(stablehlo_reduce_window_op, + BuiltinOperator_STABLEHLO_REDUCE_WINDOW, + &mock_reporter_, &mock_allocator_, nullptr), + ""); +} + +class StablehloPadFlatbufferConversionsTest : public FlatbufferConversionsTest { + public: + static constexpr int kMaxDims = + TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT; + static constexpr int64_t kValidValue = 5; +}; + +TEST_F(StablehloPadFlatbufferConversionsTest, Succeeds) { + const Operator* stablehlo_pad_op = BuildTestOperator( + BuiltinOptions2_StablehloPadOptions, + CreateStablehloPadOptions( + builder_, + /*edge_padding_low=*/builder_.CreateVector({1, 0, -1}), + /*edge_padding_high=*/builder_.CreateVector({2, 0, -2}), + /*interior_padding=*/builder_.CreateVector({3, 0, 3})) + .Union()); + TfLiteStablehloPadParams* output_data = nullptr; + EXPECT_EQ( + ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + kTfLiteOk); + EXPECT_THAT(std::make_tuple(output_data->edge_padding_low, 3), + ElementsAre(1, 0, -1)); + EXPECT_THAT(std::make_tuple(output_data->edge_padding_high, 3), + ElementsAre(2, 0, -2)); + EXPECT_THAT(std::make_tuple(output_data->interior_padding, 3), + ElementsAre(3, 0, 3)); +} + +TEST_F(StablehloPadFlatbufferConversionsTest, FailsWithMissingLowPadding) { + const Operator* stablehlo_pad_op = BuildTestOperator( + BuiltinOptions2_StablehloPadOptions, + CreateStablehloPadOptions( + builder_, + /*edge_padding_low=*/0, + /*edge_padding_high=*/builder_.CreateVector({2, 0, -2}), + /*interior_padding=*/builder_.CreateVector({3, 0, 3})) + .Union()); + TfLiteStablehloPadParams* output_data = nullptr; + EXPECT_EQ( + ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + kTfLiteError); + EXPECT_THAT( + mock_reporter_.GetString(), + AllOf( + HasSubstr("Input array not provided for operation 'stablehlo.pad'."), + HasSubstr("Check the 'edge_padding_low' attribute."))); +} + +TEST_F(StablehloPadFlatbufferConversionsTest, FailsWithMissingHighPadding) { + const Operator* stablehlo_pad_op = BuildTestOperator( + BuiltinOptions2_StablehloPadOptions, + CreateStablehloPadOptions( + builder_, + /*edge_padding_low=*/builder_.CreateVector({1, 0, -1}), + /*edge_padding_high=*/0, + /*interior_padding=*/builder_.CreateVector({3, 0, 3})) + .Union()); + TfLiteStablehloPadParams* output_data = nullptr; + EXPECT_EQ( + ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + kTfLiteError); + EXPECT_THAT( + mock_reporter_.GetString(), + AllOf( + HasSubstr("Input array not provided for operation 'stablehlo.pad'."), + HasSubstr("Check the 'edge_padding_high' attribute."))); +} + +TEST_F(StablehloPadFlatbufferConversionsTest, FailsWithMissingInteriorPadding) { + const Operator* stablehlo_pad_op = BuildTestOperator( + BuiltinOptions2_StablehloPadOptions, + CreateStablehloPadOptions( + builder_, + /*edge_padding_low=*/builder_.CreateVector({1, 0, -1}), + /*edge_padding_high=*/builder_.CreateVector({2, 0, -2}), + /*interior_padding=*/0) + .Union()); + TfLiteStablehloPadParams* output_data = nullptr; + EXPECT_EQ( + ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + kTfLiteError); + EXPECT_THAT( + mock_reporter_.GetString(), + AllOf( + HasSubstr("Input array not provided for operation 
'stablehlo.pad'."), + HasSubstr("Check the 'interior_padding' attribute."))); +} + +TEST_F(StablehloPadFlatbufferConversionsTest, FailsInconsistentSizes) { + const Operator* stablehlo_pad_op = BuildTestOperator( + BuiltinOptions2_StablehloPadOptions, + CreateStablehloPadOptions( + builder_, + /*edge_padding_low=*/builder_.CreateVector({1, 0, -1}), + /*edge_padding_high=*/builder_.CreateVector({2, 0, -2}), + /*interior_padding=*/builder_.CreateVector({3, 0, -3, 5})) + .Union()); + TfLiteStablehloPadParams* output_data = nullptr; + EXPECT_EQ( + ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + kTfLiteError); + EXPECT_THAT(mock_reporter_.GetString(), + HasSubstr("'stablehlo.pad' operation parameter array sizes are " + "not consistent.")); +} + +TEST_F(StablehloPadFlatbufferConversionsTest, FailsWithWrongOptions) { + const Operator* stablehlo_pad_op = BuildTestOperator(BuiltinOptions_NONE, 0); + TfLiteStablehloPadParams* output_data = nullptr; + EXPECT_EQ( + ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, (void**)&output_data), + kTfLiteError); + EXPECT_THAT(mock_reporter_.GetString(), + HasSubstr("Could not get 'stablehlo.pad' operation parameters.")); +} + +TEST_F(StablehloPadFlatbufferConversionsTest, DeathTests) { + const Operator* stablehlo_pad_op = BuildTestOperator(BuiltinOptions_NONE, 0); + TfLiteStablehloPadParams* output_data = nullptr; +#ifdef NDEBUG + GTEST_SKIP(); +#endif + EXPECT_DEATH( + ParseOpData(nullptr, BuiltinOperator_STABLEHLO_PAD, &mock_reporter_, + &mock_allocator_, (void**)&output_data), + ""); + EXPECT_DEATH(ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + nullptr, &mock_allocator_, (void**)&output_data), + ""); + EXPECT_DEATH(ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, nullptr, (void**)&output_data), + ""); + EXPECT_DEATH(ParseOpData(stablehlo_pad_op, BuiltinOperator_STABLEHLO_PAD, + &mock_reporter_, &mock_allocator_, nullptr), + ""); +} + } // namespace tflite diff --git a/tensorflow/lite/core/c/BUILD b/tensorflow/lite/core/c/BUILD index e9e7fb17cf3936..70999f0b24cf7a 100644 --- a/tensorflow/lite/core/c/BUILD +++ b/tensorflow/lite/core/c/BUILD @@ -1,9 +1,9 @@ +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load( "//tensorflow/lite:build_def.bzl", "tflite_cc_library_with_c_headers_test", "tflite_copts", ) -load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") load( "//tensorflow/lite/core/c:special_rules.bzl", "c_api_experimental_visibility_allowlist", @@ -190,6 +190,33 @@ cc_test( ], ) +cc_test( + name = "c_api_test_with_opaque_delegate", + size = "small", + srcs = ["c_api_test.cc"], + copts = tflite_copts(), + data = [ + "//tensorflow/lite:testdata/2_subgraphs.bin", + "//tensorflow/lite:testdata/add.bin", + "//tensorflow/lite:testdata/add_quantized.bin", + "//tensorflow/lite:testdata/custom_sinh.bin", + ], + local_defines = ["TFLITE_USE_OPAQUE_DELEGATE"], + deps = [ + ":c_api", + ":c_api_experimental", + ":c_api_types", + ":common", + "//tensorflow/lite:string_util", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/core:subgraph", + "//tensorflow/lite/delegates:delegate_test_util", + "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/testing:util", + "@com_google_googletest//:gtest_main", + ], +) + cc_test( name = "selectively_built_c_api_test", size = "small", @@ -350,6 +377,7 @@ tflite_cc_library_with_c_headers_test( 
"//tensorflow/lite:signature_runner", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/c:c_api_opaque_internal", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/core:framework", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/profiling/telemetry:profiler", @@ -387,6 +415,7 @@ tflite_cc_library_with_c_headers_test( "//tensorflow/lite:util", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/c:c_api_opaque_internal", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/core:framework", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/profiling/telemetry:profiler", @@ -438,6 +467,7 @@ tflite_cc_library_with_c_headers_test( "//tensorflow/lite:util", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/c:c_api_opaque_internal_without_alwayslink", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/core:framework", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/profiling/telemetry:profiler", @@ -563,6 +593,7 @@ cc_test( ":common", "//tensorflow/lite:kernel_api", "//tensorflow/lite:util", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/delegates:delegate_test_util", "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest_main", diff --git a/tensorflow/lite/core/c/builtin_op_data.h b/tensorflow/lite/core/c/builtin_op_data.h index b96350f45e2af5..1ac385b932b15e 100644 --- a/tensorflow/lite/core/c/builtin_op_data.h +++ b/tensorflow/lite/core/c/builtin_op_data.h @@ -35,6 +35,7 @@ extern "C" { #define TFLITE_STABLEHLO_SCATTER_PARAMS_MAX_DIMENSION_COUNT 8 #define TFLITE_STABLEHLO_GATHER_PARAMS_MAX_DIMENSION_COUNT 8 #define TFLITE_STABLEHLO_REDUCE_WINDOW_PARAMS_MAX_DIMENSION_COUNT 8 +#define TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT 8 // TODO(aselle): Consider using "if this then that" for testing. @@ -636,6 +637,14 @@ typedef struct { enum TfLiteReduceWindowFunction reduce_function; } TfLiteReduceWindowParams; +typedef struct { + // See the stablehlo spec for the explanation of the attributes: + // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#pad + int64_t edge_padding_low[TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT]; + int64_t edge_padding_high[TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT]; + int64_t interior_padding[TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT]; +} TfLiteStablehloPadParams; + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/tensorflow/lite/core/c/c_api.h b/tensorflow/lite/core/c/c_api.h index b98fddf2569744..f7504a315f1bff 100644 --- a/tensorflow/lite/core/c/c_api.h +++ b/tensorflow/lite/core/c/c_api.h @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// \warning Note: Users of TensorFlow Lite should not include this file -// directly, but should instead include -// "third_party/tensorflow/lite/c/c_api.h". Only the TensorFlow Lite -// implementation itself should include this -// file directly. +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include "third_party/tensorflow/lite/c/c_api.h". +// Only the TensorFlow Lite implementation itself should include this file +// directly. #ifndef TENSORFLOW_LITE_CORE_C_C_API_H_ #define TENSORFLOW_LITE_CORE_C_C_API_H_ @@ -76,6 +75,16 @@ limitations under the License. 
/// TfLiteInterpreterOptionsDelete(options); /// TfLiteModelDelete(model); /// +/// +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/c/c_api.h" +/// \endcode +/// to access the APIs documented on this page. +// NOLINTEND(whitespace/line_length) +// clang-format on #ifdef __cplusplus extern "C" { @@ -83,7 +92,7 @@ extern "C" { // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup c_api tensorflow/lite/c/c_api.h +/** \defgroup c_api lite/c/c_api.h * @{ */ // NOLINTEND(whitespace/line_length) @@ -276,8 +285,6 @@ TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsAddRegistrationExternal( /// /// By default it is disabled and calling to `TfLiteInterpreterCancel` will /// return kTfLiteError. See `TfLiteInterpreterCancel`. -/// -/// \warning This is an experimental API and subject to change. TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterOptionsEnableCancellation( TfLiteInterpreterOptions* options, bool enable); @@ -448,8 +455,6 @@ TfLiteTensor* TfLiteInterpreterGetTensor(const TfLiteInterpreter* interpreter, /// /// Returns kTfLiteError if cancellation is not enabled via /// `TfLiteInterpreterOptionsEnableCancellation`. -/// -/// \warning This is an experimental API and subject to change. TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterCancel( const TfLiteInterpreter* interpreter); diff --git a/tensorflow/lite/core/c/c_api_experimental.cc b/tensorflow/lite/core/c/c_api_experimental.cc index f7e117a64c53e0..f88349efc9f7f3 100644 --- a/tensorflow/lite/core/c/c_api_experimental.cc +++ b/tensorflow/lite/core/c/c_api_experimental.cc @@ -17,12 +17,14 @@ limitations under the License. #include +#include #include #include #include #include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/c/c_api.h" #include "tensorflow/lite/core/interpreter.h" #include "tensorflow/lite/profiling/telemetry/profiler.h" @@ -166,11 +168,6 @@ void TfLiteInterpreterOptionsSetEnableDelegateFallback( options->enable_delegate_fallback = enable; } -void TfLiteSetAllowBufferHandleOutput(const TfLiteInterpreter* interpreter, - bool allow_buffer_handle_output) { - interpreter->impl->SetAllowBufferHandleOutput(allow_buffer_handle_output); -} - TfLiteStatus TfLiteInterpreterModifyGraphWithDelegate( const TfLiteInterpreter* interpreter, TfLiteDelegate* delegate) { return interpreter->impl->ModifyGraphWithDelegate(delegate); @@ -191,6 +188,26 @@ int32_t TfLiteInterpreterGetSignatureCount( return static_cast(interpreter->impl->signature_keys().size()); } +TfLiteStatus TfLiteInterpreterSetBufferHandle(TfLiteInterpreter* interpreter, + TfLiteTensor* tensor, + TfLiteBufferHandle buffer_handle, + TfLiteOpaqueDelegate* delegate) { + return interpreter->impl->SetBufferHandle(tensor, buffer_handle, delegate); +} + +TfLiteStatus TfLiteInterpreterGetBufferHandle(TfLiteInterpreter* interpreter, + int tensor_index, + TfLiteBufferHandle* buffer_handle, + TfLiteOpaqueDelegate** delegate) { + return interpreter->impl->GetBufferHandle(tensor_index, buffer_handle, + delegate); +} + +void TfLiteSetAllowBufferHandleOutput(const TfLiteInterpreter* interpreter, + bool allow_buffer_handle_output) { + interpreter->impl->SetAllowBufferHandleOutput(allow_buffer_handle_output); +} + TfLiteStatus TfLiteInterpreterSetCustomAllocationForTensor( TfLiteInterpreter* interpreter, int tensor_index, const TfLiteCustomAllocation* 
allocation, int64_t flags) { @@ -201,6 +218,11 @@ TfLiteStatus TfLiteInterpreterSetCustomAllocationForTensor( *allocation, flags); } +TfLiteStatus TfLiteInterpreterEnsureTensorDataIsReadable( + TfLiteInterpreter* interpreter, int tensor_index) { + return interpreter->impl->EnsureTensorDataIsReadable(tensor_index); +} + const char* TfLiteInterpreterGetSignatureKey( const TfLiteInterpreter* interpreter, int32_t signature_index) { int32_t signature_count = TfLiteInterpreterGetSignatureCount(interpreter); diff --git a/tensorflow/lite/core/c/c_api_experimental.h b/tensorflow/lite/core/c/c_api_experimental.h index 95528110167369..9de042491ead7d 100644 --- a/tensorflow/lite/core/c/c_api_experimental.h +++ b/tensorflow/lite/core/c/c_api_experimental.h @@ -21,6 +21,7 @@ limitations under the License. #define TENSORFLOW_LITE_CORE_C_C_API_EXPERIMENTAL_H_ #include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/c/c_api.h" #include "tensorflow/lite/core/c/common.h" @@ -266,17 +267,6 @@ TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsSetUseNNAPI( TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsSetEnableDelegateFallback( TfLiteInterpreterOptions* options, bool enable); -// Set if buffer handle output is allowed. -// -/// When using hardware delegation, Interpreter will make the data of output -/// tensors available in `tensor->data` by default. If the application can -/// consume the buffer handle directly (e.g. reading output from OpenGL -/// texture), it can set this flag to false, so Interpreter won't copy the -/// data from buffer handle to CPU memory. WARNING: This is an experimental -/// API and subject to change. -TFL_CAPI_EXPORT extern void TfLiteSetAllowBufferHandleOutput( - const TfLiteInterpreter* interpreter, bool allow_buffer_handle_output); - /// Allow a delegate to look at the graph and modify the graph to handle /// parts of the graph themselves. After this is called, the graph may /// contain new nodes that replace 1 more nodes. @@ -332,6 +322,41 @@ TfLiteInterpreterSetCustomAllocationForTensor( TfLiteInterpreter* interpreter, int tensor_index, const TfLiteCustomAllocation* allocation, int64_t flags); +/// -------------------------------------------------------------------------- +/// BufferHandle APIs + +/// Sets the delegate buffer handle for the given tensor. +/// +/// This function sets the buffer handle for a tensor that is used by other +/// computing hardware such as EdgeTpu. For example, EdgeTpu delegate imports a +/// tensor's memory into EdgeTpu's virtual address and returns a buffer handle. +/// Then EdgeTpu delegate calls this API to associate the tensor with the buffer +/// handle. +/// +/// WARNING: This is an experimental API and subject to change. +TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterSetBufferHandle( + TfLiteInterpreter* interpreter, TfLiteTensor* tensor, + TfLiteBufferHandle buffer_handle, TfLiteOpaqueDelegate* delegate); + +/// Gets the delegate buffer handle, and the delegate which can process +/// the buffer handle. +/// +/// WARNING: This is an experimental API and subject to change. +TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterGetBufferHandle( + TfLiteInterpreter* interpreter, int tensor_index, + TfLiteBufferHandle* buffer_handle, TfLiteOpaqueDelegate** delegate); + +/// Sets whether buffer handle output is allowed. +/// When using hardware delegation, Interpreter will make the data of output +/// tensors available in `tensor->data` by default. 
If the application can +/// consume the buffer handle directly (e.g. reading output from OpenGL +/// texture), it can set this flag to false, so Interpreter won't copy the +/// data from buffer handle to CPU memory. +/// +/// WARNING: This is an experimental API and subject to change. +TFL_CAPI_EXPORT extern void TfLiteSetAllowBufferHandleOutput( + const TfLiteInterpreter* interpreter, bool allow_buffer_handle_output); + /// -------------------------------------------------------------------------- /// SignatureRunner APIs @@ -360,6 +385,16 @@ TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsSetTelemetryProfiler( TfLiteInterpreterOptions* options, struct TfLiteTelemetryProfilerStruct* profiler); +/// Ensures the data of the tensor at the given index is readable. +/// Note: If a delegate has been used, and `SetAllowBufferHandleOutput(true)` +/// has been called, tensor outputs may be stored as delegate buffer handles +/// whose data is not directly readable until this method has been called. In +/// such cases, this method will copy the data from the delegate buffer handle +/// to CPU memory. +/// +/// WARNING: This is an experimental API and subject to change. +TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterEnsureTensorDataIsReadable( + TfLiteInterpreter* interpreter, int tensor_index); #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/tensorflow/lite/core/c/c_api_experimental_test.cc b/tensorflow/lite/core/c/c_api_experimental_test.cc index 2425dd97676ba3..c52ad232b6a9e7 100644 --- a/tensorflow/lite/core/c/c_api_experimental_test.cc +++ b/tensorflow/lite/core/c/c_api_experimental_test.cc @@ -15,9 +15,11 @@ limitations under the License. #include "tensorflow/lite/core/c/c_api_experimental.h" +#include #include #include #include +#include #include #include #include @@ -25,6 +27,7 @@ limitations under the License. #include #include #include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/c/c_api.h" #include "tensorflow/lite/core/c/c_api_opaque.h" #include "tensorflow/lite/core/c/common.h" @@ -566,6 +569,356 @@ TEST(CApiExperimentalTest, SetCustomAllocationForOutputTensorSuccess) { TfLiteModelDelete(model); } +TEST(CApiExperimentalTest, SetAndGetBufferHandleSuccess) { + TfLiteModel* model = + TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); + + auto simple_delegate = std::make_unique( + // The delegate will handle the first (index 0) and the second (index 1) + // op nodes in the TfLiteModel. + /*nodes=*/std::vector({0, 1}), + /*delegate_flags=*/kTfLiteDelegateFlagsNone, + /*fail_node_prepare=*/false, /*min_ops_per_subset=*/0, + /*fail_node_invoke=*/false, + /* automatic_shape_propagation=*/false, /*custom_op=*/false, + /* set_output_tensor_dynamic =*/false); + TfLiteDelegate* delegate = simple_delegate->get_tf_lite_delegate(); + + TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + TfLiteInterpreterOptionsAddDelegate(options, delegate); + TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options); + ASSERT_NE(interpreter, nullptr); + EXPECT_EQ(TfLiteInterpreterAllocateTensors(interpreter), kTfLiteOk); + + // Tensor index is set to the input tensor (index 1) of the TfLiteModel. 
+ int tensor_index = 1; + TfLiteTensor* tensor = TfLiteInterpreterGetTensor(interpreter, tensor_index); + ASSERT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle); + ASSERT_EQ(tensor->delegate, nullptr); + + // Use of an arbitrary non-negative int value for the buffer handle. + TfLiteBufferHandle buffer_handle = 1234; + + TfLiteDelegate* expected_delegate = delegate; + TfLiteBufferHandle expected_buffer_handle = buffer_handle; + ASSERT_EQ(TfLiteInterpreterSetBufferHandle(interpreter, tensor, buffer_handle, + delegate), + kTfLiteOk); + ASSERT_EQ(tensor->delegate, expected_delegate); + ASSERT_EQ(tensor->buffer_handle, expected_buffer_handle); + + TfLiteOpaqueDelegate* fetched_delegate; + TfLiteBufferHandle fetched_buffer_handle; + ASSERT_EQ( + TfLiteInterpreterGetBufferHandle( + interpreter, tensor_index, &fetched_buffer_handle, &fetched_delegate), + kTfLiteOk); + ASSERT_EQ(fetched_delegate, expected_delegate); + ASSERT_EQ(fetched_buffer_handle, expected_buffer_handle); + + EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk); + + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); +} + +// A utility struct, intended to be used to record the interaction between a +// test delegate and the runtime. +struct DelegateState { + bool delegate_prepared; + bool copy_from_buffer_handle_called; + bool free_buffer_handle_called; + int buffer_handle; + + void Reset() { + delegate_prepared = false; + copy_from_buffer_handle_called = false; + free_buffer_handle_called = false; + buffer_handle = -1; + } +}; + +struct OpaqueTestDelegate { + static constexpr int kTestDelegateOutput = 42; + + static inline TfLiteStatus Prepare(TfLiteOpaqueContext* opaque_context, + TfLiteOpaqueDelegate* opaque_delegate, + void* data) { + DelegateState* delegate_state = reinterpret_cast(data); + delegate_state->delegate_prepared = true; + + // The buffer handle is set to one greater than the last allocated buffer + // handle. 
+ delegate_state->buffer_handle++; + + TfLiteRegistration registration{}; + registration.registration_external = TfLiteRegistrationExternalCreate( + kTfLiteBuiltinDelegate, "OpaqueTestDelegate delegate kernel", + /* version = */ 1); + + TfLiteRegistrationExternalSetPrepare( + registration.registration_external, + [](TfLiteOpaqueContext* context, + TfLiteOpaqueNode* node) -> TfLiteStatus { return kTfLiteOk; }); + + TfLiteRegistrationExternalSetInvoke( + registration.registration_external, + [](TfLiteOpaqueContext*, TfLiteOpaqueNode*) -> TfLiteStatus { + return kTfLiteOk; + }); + + TfLiteIntArray* execution_plan; + TfLiteOpaqueContextGetExecutionPlan(opaque_context, &execution_plan); + + TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels( + opaque_context, registration.registration_external, execution_plan, + opaque_delegate); + return kTfLiteOk; + } + + static TfLiteStatus CopyFromBufferHandle(TfLiteOpaqueContext* context, + TfLiteOpaqueDelegate* delegate, + void* data, + TfLiteBufferHandle buffer_handle, + TfLiteOpaqueTensor* opaque_tensor) { + DelegateState* delegate_state = reinterpret_cast(data); + delegate_state->copy_from_buffer_handle_called = true; + delegate_state->buffer_handle = buffer_handle; + + auto* output = + reinterpret_cast(TfLiteOpaqueTensorData(opaque_tensor)); + int total_num_elements = 1; + for (int i = 0; i < TfLiteOpaqueTensorNumDims(opaque_tensor); ++i) { + total_num_elements *= TfLiteOpaqueTensorDim(opaque_tensor, i); + } + std::vector meaning_of_life(total_num_elements, kTestDelegateOutput); + memcpy(output, meaning_of_life.data(), + meaning_of_life.size() * sizeof(float)); + return kTfLiteOk; + } + + static inline void FreeBufferHandle(TfLiteOpaqueContext* context, + TfLiteOpaqueDelegate* delegate, + void* data, + TfLiteBufferHandle* buffer_handle) { + DelegateState* delegate_state = reinterpret_cast(data); + delegate_state->free_buffer_handle_called = true; + } +}; + +TEST(CApiExperimentalTest, SetAllowBufferHandleOutputFalse) { + DelegateState delegate_state; + delegate_state.Reset(); + + TfLiteModel* model = + TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); + int kNumTensorElements = 1 * 8 * 8 * 3; + + TfLiteOpaqueDelegateBuilder opaque_delegate_builder{}; + opaque_delegate_builder.data = &delegate_state; + opaque_delegate_builder.CopyFromBufferHandle = + OpaqueTestDelegate::CopyFromBufferHandle; + opaque_delegate_builder.FreeBufferHandle = + OpaqueTestDelegate::FreeBufferHandle; + opaque_delegate_builder.Prepare = OpaqueTestDelegate::Prepare; + + TfLiteOpaqueDelegate* tflite_delegate = + TfLiteOpaqueDelegateCreate(&opaque_delegate_builder); + + TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + TfLiteInterpreterOptionsAddDelegate(options, tflite_delegate); + TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options); + ASSERT_NE(interpreter, nullptr); + + // Allocate tensor buffers. + EXPECT_EQ(TfLiteInterpreterAllocateTensors(interpreter), kTfLiteOk); + + // Fill input buffers + TfLiteTensor* input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0); + float* input = reinterpret_cast(input_tensor->data.raw); + std::fill(input, input + kNumTensorElements, 1); + + // We set the buffer handle of the output tensor and mark its data as stale. + // This will make the interpreter call 'CopyFromBufferHandle' to refresh the + // output tensor's data. + int first_buffer_handle = 0; + + // Tensor index is set to the output tensor (index 2) of the TfLite model. 
+  int tensor_index = 2;
+
+  TfLiteTensor* output_tensor =
+      TfLiteInterpreterGetTensor(interpreter, tensor_index);
+
+  ASSERT_EQ(
+      TfLiteInterpreterSetBufferHandle(interpreter, output_tensor,
+                                       first_buffer_handle, tflite_delegate),
+      kTfLiteOk);
+
+  output_tensor->data_is_stale = true;
+
+  TfLiteSetAllowBufferHandleOutput(interpreter,
+                                   /*allow_buffer_handle_output=*/false);
+
+  // Run inference
+  EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk);
+  EXPECT_TRUE(delegate_state.delegate_prepared);
+  EXPECT_TRUE(delegate_state.copy_from_buffer_handle_called);
+  EXPECT_EQ(delegate_state.buffer_handle, first_buffer_handle);
+  EXPECT_FALSE(delegate_state.free_buffer_handle_called);
+  float* outputs = reinterpret_cast(output_tensor->data.raw);
+  for (int i = 0; i < kNumTensorElements; ++i) {
+    EXPECT_EQ(outputs[i], OpaqueTestDelegate::kTestDelegateOutput);
+  }
+  ASSERT_EQ(output_tensor->buffer_handle, first_buffer_handle);
+  ASSERT_EQ(output_tensor->delegate, tflite_delegate);
+
+  // Destroying the interpreter will release any buffer handles that are
+  // associated with the tensors owned by the interpreter.
+  delegate_state.Reset();
+  TfLiteInterpreterDelete(interpreter);
+  TfLiteOpaqueDelegateDelete(tflite_delegate);
+  TfLiteInterpreterOptionsDelete(options);
+  TfLiteModelDelete(model);
+  EXPECT_FALSE(delegate_state.copy_from_buffer_handle_called);
+  EXPECT_TRUE(delegate_state.free_buffer_handle_called);
+}
+
+TEST(CApiExperimentalTest, SetAllowBufferHandleOutputTrue) {
+  DelegateState delegate_state;
+  delegate_state.Reset();
+
+  TfLiteModel* model =
+      TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin");
+  ASSERT_NE(model, nullptr);
+  int kNumTensorElements = 1 * 8 * 8 * 3;
+
+  TfLiteOpaqueDelegateBuilder opaque_delegate_builder{};
+  opaque_delegate_builder.data = &delegate_state;
+  opaque_delegate_builder.CopyFromBufferHandle =
+      OpaqueTestDelegate::CopyFromBufferHandle;
+  opaque_delegate_builder.FreeBufferHandle =
+      OpaqueTestDelegate::FreeBufferHandle;
+  opaque_delegate_builder.Prepare = OpaqueTestDelegate::Prepare;
+
+  TfLiteOpaqueDelegate* tflite_delegate =
+      TfLiteOpaqueDelegateCreate(&opaque_delegate_builder);
+
+  TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();
+  TfLiteInterpreterOptionsAddDelegate(options, tflite_delegate);
+  TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
+  ASSERT_NE(interpreter, nullptr);
+
+  // Allocate tensor buffers.
+  EXPECT_EQ(TfLiteInterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+  // Fill input buffers
+  TfLiteTensor* input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0);
+  float* input = reinterpret_cast(input_tensor->data.raw);
+  std::fill(input, input + kNumTensorElements, 1);
+
+  // We set the buffer handle of the output tensor and mark its data as stale.
+  // This will make the interpreter call 'CopyFromBufferHandle' to refresh the
+  // output tensor's data.
+  EXPECT_FALSE(delegate_state.free_buffer_handle_called);
+  int first_buffer_handle = 0;
+
+  // Tensor index is set to the output tensor (index 2) of the TfLite model.
+  int tensor_index = 2;
+
+  TfLiteTensor* output_tensor =
+      TfLiteInterpreterGetTensor(interpreter, tensor_index);
+
+  ASSERT_EQ(
+      TfLiteInterpreterSetBufferHandle(interpreter, output_tensor,
+                                       first_buffer_handle, tflite_delegate),
+      kTfLiteOk);
+
+  output_tensor->data_is_stale = true;
+
+  TfLiteSetAllowBufferHandleOutput(interpreter,
+                                   /*allow_buffer_handle_output=*/true);
+
+  // Run inference
+  EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk);
+  EXPECT_TRUE(delegate_state.delegate_prepared);
+  EXPECT_FALSE(delegate_state.copy_from_buffer_handle_called);
+  EXPECT_EQ(delegate_state.buffer_handle, first_buffer_handle);
+  EXPECT_FALSE(delegate_state.free_buffer_handle_called);
+  ASSERT_EQ(output_tensor->buffer_handle, first_buffer_handle);
+  ASSERT_EQ(output_tensor->delegate, tflite_delegate);
+
+  // Destroying the interpreter will release any buffer handles that are
+  // associated with the tensors owned by the interpreter.
+  delegate_state.Reset();
+  TfLiteInterpreterDelete(interpreter);
+  TfLiteOpaqueDelegateDelete(tflite_delegate);
+  TfLiteInterpreterOptionsDelete(options);
+  TfLiteModelDelete(model);
+  EXPECT_FALSE(delegate_state.copy_from_buffer_handle_called);
+  EXPECT_TRUE(delegate_state.free_buffer_handle_called);
+}
+
+TEST(CApiExperimentalTest, SetInvalidHandleToTensor) {
+  TfLiteModel* model =
+      TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin");
+  ASSERT_NE(model, nullptr);
+
+  auto simple_delegate = std::make_unique(
+      // The delegate will handle the first (index 0) and the second (index 1)
+      // op nodes in the TfLiteModel.
+      /*nodes=*/std::vector({0, 1}),
+      /*delegate_flags=*/kTfLiteDelegateFlagsNone,
+      /*fail_node_prepare=*/false, /*min_ops_per_subset=*/0,
+      /*fail_node_invoke=*/false,
+      /* automatic_shape_propagation=*/false, /*custom_op=*/false,
+      /* set_output_tensor_dynamic =*/false);
+  TfLiteDelegate* delegate = simple_delegate->get_tf_lite_delegate();
+
+  TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();
+  TfLiteInterpreterOptionsAddDelegate(options, delegate);
+  TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
+  ASSERT_NE(interpreter, nullptr);
+
+  EXPECT_EQ(TfLiteInterpreterAllocateTensors(interpreter), kTfLiteOk);
+  EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk);
+
+  auto another_simple_delegate = std::make_unique(
+      // The delegate will handle the 0th, 1st and the 2nd indexed nodes in
+      // the TfLiteModel.
+      /*nodes=*/std::vector({0, 1, 2}),
+      /*delegate_flags=*/kTfLiteDelegateFlagsNone,
+      /*fail_node_prepare=*/false, /*min_ops_per_subset=*/0,
+      /*fail_node_invoke=*/false, /* automatic_shape_propagation=*/false,
+      /*custom_op=*/false, /*set_output_tensor_dynamic=*/false);
+
+  // Tensor index is set to the output tensor (index 2) of the TfLite model.
+  int tensor_index = 2;
+  TfLiteTensor* tensor = TfLiteInterpreterGetTensor(interpreter, tensor_index);
+
+  // Before setting the buffer handle, the tensor's `delegate` is already set
+  // because it will be written by the delegate.
+  ASSERT_EQ(tensor->delegate, delegate);
+  ASSERT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle);
+
+  // Buffer handle is set to one greater than the last allocated buffer handle.
+  TfLiteBufferHandle buffer_handle = kTfLiteNullBufferHandle + 1;
+
+  // Setting a buffer handle to a tensor with another delegate will fail.
+ ASSERT_EQ(TfLiteInterpreterSetBufferHandle( + interpreter, tensor, buffer_handle, + another_simple_delegate->get_tf_lite_delegate()), + kTfLiteError); + EXPECT_EQ(tensor->delegate, delegate); + EXPECT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle); + + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); +} + void AllocateAndSetInputs(TfLiteInterpreter* interpreter) { std::array input_dims = {2}; ASSERT_EQ(TfLiteInterpreterResizeInputTensor( diff --git a/tensorflow/lite/core/c/c_api_opaque.cc b/tensorflow/lite/core/c/c_api_opaque.cc index 13cf85cfb967bb..926d0a4714fef3 100644 --- a/tensorflow/lite/core/c/c_api_opaque.cc +++ b/tensorflow/lite/core/c/c_api_opaque.cc @@ -147,7 +147,10 @@ size_t TfLiteOpaqueTensorByteSize(const TfLiteOpaqueTensor* opaque_tensor) { } void* TfLiteOpaqueTensorData(const TfLiteOpaqueTensor* opaque_tensor) { - return TfLiteTensorData(reinterpret_cast(opaque_tensor)); + return opaque_tensor != nullptr + ? TfLiteTensorData( + reinterpret_cast(opaque_tensor)) + : nullptr; } TfLiteAllocationType TfLiteOpaqueTensorGetAllocationType( diff --git a/tensorflow/lite/core/c/c_api_opaque.h b/tensorflow/lite/core/c/c_api_opaque.h index 06bdc194b221f6..0a012bfebae087 100644 --- a/tensorflow/lite/core/c/c_api_opaque.h +++ b/tensorflow/lite/core/c/c_api_opaque.h @@ -12,6 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include "third_party/tensorflow/lite/c/c_api_opaque.h". +// Only the TensorFlow Lite implementation itself should include this file +// directly. + #ifndef TENSORFLOW_LITE_CORE_C_C_API_OPAQUE_H_ #define TENSORFLOW_LITE_CORE_C_C_API_OPAQUE_H_ @@ -36,10 +41,20 @@ extern "C" { /// potentially including non-backwards-compatible changes, on a different /// schedule than for the other TensorFlow Lite APIs. See /// https://www.tensorflow.org/guide/versions#separate_version_number_for_tensorflow_lite_extension_apis. +/// +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/c/c_api_opaque.h" +/// \endcode +/// to access the APIs documented on this page. +// NOLINTEND(whitespace/line_length) +// clang-format on // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup c_api_opaque tensorflow/lite/c/c_api_opaque.h +/** \defgroup c_api_opaque lite/c/c_api_opaque.h * @{ */ // NOLINTEND(whitespace/line_length) @@ -93,6 +108,7 @@ TFL_CAPI_EXPORT extern size_t TfLiteOpaqueTensorByteSize( const TfLiteOpaqueTensor* opaque_tensor); /// Returns a pointer to the underlying data buffer. +/// Returns nullptr if input is also nullptr. 
TFL_CAPI_EXPORT extern void* TfLiteOpaqueTensorData( const TfLiteOpaqueTensor* opaque_tensor); diff --git a/tensorflow/lite/core/c/c_api_opaque_test.cc b/tensorflow/lite/core/c/c_api_opaque_test.cc index ab2c00b2604a4f..f59a35c3c0feb4 100644 --- a/tensorflow/lite/core/c/c_api_opaque_test.cc +++ b/tensorflow/lite/core/c/c_api_opaque_test.cc @@ -166,6 +166,18 @@ TEST(TestTfLiteOpaqueTensorGetBufferAddressStability, TfLiteTensorGetBufferAddressStability(&t)); } +TEST(TestTfLiteOpaqueTensorData, ValidInput) { + TfLiteTensor t; + char data[] = "data"; + t.data.raw = data; + EXPECT_EQ(TfLiteOpaqueTensorData(reinterpret_cast(&t)), + data); +} + +TEST(TestTfLiteOpaqueTensorData, NullInput) { + EXPECT_EQ(TfLiteOpaqueTensorData(nullptr), nullptr); +} + TEST(TestTfLiteOpaqueTensorGetDataStability, WithMemNoneBehavesAsTfLiteTensorGetDataStability) { TfLiteTensor t; diff --git a/tensorflow/lite/core/c/c_api_test.cc b/tensorflow/lite/core/c/c_api_test.cc index abb0083e12578c..189cd9815f8ebf 100644 --- a/tensorflow/lite/core/c/c_api_test.cc +++ b/tensorflow/lite/core/c/c_api_test.cc @@ -291,6 +291,7 @@ TEST(CApiSimple, TfLiteInterpreterGetTensor) { TfLiteInterpreterDelete(interpreter); } +#if !TFLITE_USE_OPAQUE_DELEGATE TEST(CApiSimple, Delegate) { TfLiteModel* model = TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); @@ -316,6 +317,7 @@ TEST(CApiSimple, Delegate) { EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk); TfLiteInterpreterDelete(interpreter); } +#endif TEST(CApiSimple, DelegateExternal_GetExecutionPlan) { TfLiteModel* model = @@ -409,6 +411,7 @@ TEST(CApiSimple, DelegateExternal_MarkSubgraphAsDelegationSkippable) { TfLiteOpaqueDelegateDelete(opaque_delegate); } +#if !TFLITE_USE_OPAQUE_DELEGATE TEST(CApiSimple, DelegateFails) { TfLiteModel* model = TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); @@ -428,6 +431,7 @@ TEST(CApiSimple, DelegateFails) { TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); } +#endif struct DelegateState { bool delegate_prepared; diff --git a/tensorflow/lite/core/c/c_api_types.h b/tensorflow/lite/core/c/c_api_types.h index c1f0c568fcf04a..1170025cbab9a2 100644 --- a/tensorflow/lite/core/c/c_api_types.h +++ b/tensorflow/lite/core/c/c_api_types.h @@ -12,16 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include "third_party/tensorflow/lite/c/c_api_types.h". +// Only the TensorFlow Lite implementation itself should include this file +// directly. /// This file declares types used by the pure C inference API defined in /// c_api.h, some of which are also used in the C++ and C kernel and interpreter /// APIs. - -// WARNING: Users of TensorFlow Lite should not include this file directly, -// but should instead include -// "third_party/tensorflow/lite/c/c_api_types.h". -// Only the TensorFlow Lite implementation itself should include this -// file directly. +/// +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/c/c_api_types.h" +/// \endcode +/// to access the APIs documented on this page. 
+// NOLINTEND(whitespace/line_length) +// clang-format on // IWYU pragma: private, include "third_party/tensorflow/lite/c/c_api_types.h" @@ -36,7 +44,7 @@ extern "C" { // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup c_api_types tensorflow/lite/c/c_api_types.h +/** \defgroup c_api_types lite/c/c_api_types.h * @{ */ // NOLINTEND(whitespace/line_length) diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h index 0ebba76e948f33..ca29104f203954 100644 --- a/tensorflow/lite/core/c/common.h +++ b/tensorflow/lite/core/c/common.h @@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// WARNING: Users of TensorFlow Lite should not include this file directly, but +// should instead include "third_party/tensorflow/lite/c/common.h". +// Only the TensorFlow Lite implementation itself should include this file +// directly. /// This file defines common C types and APIs for implementing operations, /// delegates and other constructs in TensorFlow Lite. The actual operations and @@ -32,12 +36,17 @@ limitations under the License. /// /// NOTE: The order of values in these structs are "semi-ABI stable". New values /// should be added only to the end of structs and never reordered. +/// +// clang-format off +// NOLINTBEGIN(whitespace/line_length) +/// \note Users of TensorFlow Lite should use +/// \code +/// #include "tensorflow/lite/c/common.h" +/// \endcode +/// to access the APIs documented on this page. +// NOLINTEND(whitespace/line_length) +// clang-format on -// WARNING: Users of TensorFlow Lite should not include this file directly, -// but should instead include -// "third_party/tensorflow/lite/c/common.h". -// Only the TensorFlow Lite implementation itself should include this -// file directly. // IWYU pragma: private, include "third_party/tensorflow/lite/c/common.h" #ifndef TENSORFLOW_LITE_CORE_C_COMMON_H_ @@ -56,7 +65,7 @@ extern "C" { // clang-format off // NOLINTBEGIN(whitespace/line_length) -/** \defgroup common tensorflow/lite/c/common.h +/** \defgroup common lite/c/common.h * @{ */ // NOLINTEND(whitespace/line_length) diff --git a/tensorflow/lite/core/interpreter.cc b/tensorflow/lite/core/interpreter.cc index ee9748c031e87d..5c2917e8be9f24 100644 --- a/tensorflow/lite/core/interpreter.cc +++ b/tensorflow/lite/core/interpreter.cc @@ -225,8 +225,8 @@ TfLiteStatus Interpreter::Invoke() { ScopedRuntimeInstrumentationProfile scoped_runtime_event(root_profiler_.get(), "invoke"); - // "Resets" cancellation flag so cancellation happens before this invoke will - // not take effect. + // "Resets" cancellation flag so cancellation that happens before this invoke + // will not take effect. if (cancellation_enabled_) (void)continue_invocation_.test_and_set(); // Denormal floating point numbers could cause significant slowdown on diff --git a/tensorflow/lite/core/interpreter.h b/tensorflow/lite/core/interpreter.h index 98a2fd67f4da90..ed9d798f34753b 100644 --- a/tensorflow/lite/core/interpreter.h +++ b/tensorflow/lite/core/interpreter.h @@ -580,6 +580,7 @@ class Interpreter { /// 5. kTfLiteError: Unexpected/runtime failure. \n /// \warning This is an experimental API and subject to change. 
\n TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate); + TfLiteStatus ModifyGraphWithDelegate(TfLiteOpaqueDelegateStruct* delegate); // Owning handle to a TfLiteDelegate instance. using TfLiteDelegatePtr = @@ -611,9 +612,12 @@ class Interpreter { std::unique_ptr delegate) = delete; /// \warning This is an experimental API and subject to change. \n - /// \brief Ensure the data in `tensor.data` is readable. In case delegate is - /// used, it might require to copy the data from delegate buffer to raw - /// memory. + /// \brief Ensure the data in `tensor.data` is readable. If a + /// delegate has been used, and `SetAllowBufferHandleOutput(true)` has been + /// called, tensor outputs may be stored as delegate buffer handles whose data + /// is not directly readable until this method has been called. + /// In such cases, this method will copy the data from the delegate buffer + /// handle to CPU memory. TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) { return primary_subgraph().EnsureTensorDataIsReadable(tensor_index); } diff --git a/tensorflow/lite/core/interpreter_builder.h b/tensorflow/lite/core/interpreter_builder.h index fcdae0b2543de8..6233d6561ab29c 100644 --- a/tensorflow/lite/core/interpreter_builder.h +++ b/tensorflow/lite/core/interpreter_builder.h @@ -76,7 +76,7 @@ class InterpreterBuilder { /// For this constructor, the ErrorReporter will be extracted from the /// FlatBufferModel. /// `options` object is copied during construction. So caller can release it - // after calling the constructor. + /// after calling the constructor. InterpreterBuilder(const FlatBufferModel& model, const OpResolver& op_resolver, const InterpreterOptions* options_experimental = nullptr); @@ -84,7 +84,7 @@ class InterpreterBuilder { /// of a FlatBufferModel). Mostly used for testing. /// If `error_reporter` is null, then DefaultErrorReporter() is used. /// `options` object is copied during construction. So caller can release it - // after calling the constructor. + /// after calling the constructor. 
InterpreterBuilder(const ::tflite::Model* model, const OpResolver& op_resolver, ErrorReporter* error_reporter = DefaultErrorReporter(), diff --git a/tensorflow/lite/core/interpreter_experimental.cc b/tensorflow/lite/core/interpreter_experimental.cc index e04b1d3e7c675d..016d45df977955 100644 --- a/tensorflow/lite/core/interpreter_experimental.cc +++ b/tensorflow/lite/core/interpreter_experimental.cc @@ -84,6 +84,12 @@ TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { return ModifyGraphWithDelegateImpl(delegate); } +TfLiteStatus Interpreter::ModifyGraphWithDelegate( + TfLiteOpaqueDelegateStruct* delegate) { + return ModifyGraphWithDelegateImpl( + reinterpret_cast(delegate)); +} + bool Interpreter::HasDelegates() { return primary_subgraph().HasDelegates(); } TfLiteStatus Interpreter::SetBufferHandle(int tensor_index, diff --git a/tensorflow/lite/core/kernels/builtin_op_kernels.h b/tensorflow/lite/core/kernels/builtin_op_kernels.h index 20362ada18e65c..e0dcbf8d4b0605 100644 --- a/tensorflow/lite/core/kernels/builtin_op_kernels.h +++ b/tensorflow/lite/core/kernels/builtin_op_kernels.h @@ -291,9 +291,9 @@ Register_STABLEHLO_DYNAMIC_SLICE(); // WARNING: not implemented, using this TfLiteRegistration* Register_STABLEHLO_DYNAMIC_UPDATE_SLICE(); // WARNING: not implemented, using // this op will crash the runtime -TfLiteRegistration* -Register_STABLEHLO_PAD(); // WARNING: not implemented, using this - // op will crash the runtime + +TfLiteRegistration* Register_STABLEHLO_PAD(); + TfLiteRegistration* Register_STABLEHLO_IOTA(); // WARNING: not implemented, using this // op will crash the runtime diff --git a/tensorflow/lite/core/kernels/register.cc b/tensorflow/lite/core/kernels/register.cc index cb53c20558106e..0e3eacf4d65017 100644 --- a/tensorflow/lite/core/kernels/register.cc +++ b/tensorflow/lite/core/kernels/register.cc @@ -297,7 +297,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_ADD_N, Register_ADD_N()); AddBuiltin(BuiltinOperator_GATHER_ND, Register_GATHER_ND(), /* min_version = */ 1, - /* max_version = */ 4); + /* max_version = */ 5); AddBuiltin(BuiltinOperator_WHERE, Register_WHERE(), /* min_version = */ 1, /* max_version = */ 2); @@ -380,6 +380,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_STABLEHLO_MULTIPLY, Register_STABLEHLO_MULTIPLY()); AddBuiltin(BuiltinOperator_STABLEHLO_MAXIMUM, Register_STABLEHLO_MAXIMUM()); AddBuiltin(BuiltinOperator_STABLEHLO_MINIMUM, Register_STABLEHLO_MINIMUM()); + AddBuiltin(BuiltinOperator_STABLEHLO_PAD, Register_STABLEHLO_PAD()); AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. diff --git a/tensorflow/lite/core/model_builder.cc b/tensorflow/lite/core/model_builder.cc index 05822832cc2f93..e044c6da7e65c9 100644 --- a/tensorflow/lite/core/model_builder.cc +++ b/tensorflow/lite/core/model_builder.cc @@ -17,11 +17,12 @@ limitations under the License. 
#include #include +#include #include #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "flatbuffers/buffer.h" // from @flatbuffers #include "tensorflow/lite/allocation.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/verifier.h" @@ -386,6 +387,12 @@ std::map FlatBufferModel::ReadAllMetadata( } bool FlatBufferModel::CheckModelIdentifier() const { + if (allocation_->bytes() < 7) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Model provided must have at least 7 bytes to hold identifier.\n"); + return false; + } if (!tflite::ModelBufferHasIdentifier(allocation_->base())) { const char* ident = flatbuffers::GetBufferIdentifier(allocation_->base()); TF_LITE_REPORT_ERROR( diff --git a/tensorflow/lite/create_op_resolver.h b/tensorflow/lite/create_op_resolver.h index 41012171b07b03..853505f1c786e6 100644 --- a/tensorflow/lite/create_op_resolver.h +++ b/tensorflow/lite/create_op_resolver.h @@ -15,9 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_CREATE_OP_RESOLVER_H_ #define TENSORFLOW_LITE_CREATE_OP_RESOLVER_H_ +/// For documentation, see third_party/tensorflow/lite/core/create_op_resolver.h + #include -#include "tensorflow/lite/core/create_op_resolver.h" +#include "tensorflow/lite/core/create_op_resolver.h" // IWYU pragma: export namespace tflite { using ::tflite::CreateOpResolver; diff --git a/tensorflow/lite/delegates/coreml/BUILD b/tensorflow/lite/delegates/coreml/BUILD index 08466b7ac02360..2868fa658b216a 100644 --- a/tensorflow/lite/delegates/coreml/BUILD +++ b/tensorflow/lite/delegates/coreml/BUILD @@ -29,12 +29,11 @@ objc_library( srcs = ["coreml_executor.mm"], hdrs = ["coreml_executor.h"], copts = ["-std=c++17"], - features = ["-layering_check"], sdk_frameworks = [ "CoreML", "Foundation", ], - deps = [":mlmodel_proto_cc"], + deps = ["@coremltools//:mlmodel_cc_proto"], ) cc_library( diff --git a/tensorflow/lite/delegates/delegate_test.cc b/tensorflow/lite/delegates/delegate_test.cc index 078aa0863a7d55..560b2b4c65b940 100644 --- a/tensorflow/lite/delegates/delegate_test.cc +++ b/tensorflow/lite/delegates/delegate_test.cc @@ -51,7 +51,8 @@ using test_utils::TestTwoDelegates; namespace { TEST_F(TestDelegate, NullDelegate) { - EXPECT_EQ(interpreter_->ModifyGraphWithDelegate(nullptr), + TfLiteOpaqueDelegate* delegate = nullptr; + EXPECT_EQ(interpreter_->ModifyGraphWithDelegate(delegate), kTfLiteDelegateError); } @@ -178,14 +179,14 @@ TEST_F(TestDelegate, SetBufferHandleToInput) { TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate(); interpreter_->ModifyGraphWithDelegate(delegate); - constexpr int kOutputTensorIndex = 0; - TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + constexpr int kInputTensorIndex = 0; + TfLiteTensor* tensor = interpreter_->tensor(kInputTensorIndex); ASSERT_EQ(tensor->delegate, nullptr); ASSERT_EQ(tensor->buffer_handle, kTfLiteNullBufferHandle); TfLiteBufferHandle handle = AllocateBufferHandle(); TfLiteStatus status = - interpreter_->SetBufferHandle(kOutputTensorIndex, handle, delegate); + interpreter_->SetBufferHandle(kInputTensorIndex, handle, delegate); ASSERT_EQ(status, kTfLiteOk); EXPECT_EQ(tensor->delegate, delegate); EXPECT_EQ(tensor->buffer_handle, handle); @@ -1488,7 +1489,8 @@ TEST_P(TestFP16Delegation, NonDelegatedInterpreterWorks) { } TEST_F(TestFP16Delegation, NullDelegate) { - EXPECT_EQ(interpreter_->ModifyGraphWithDelegate(nullptr), + TfLiteOpaqueDelegate* delegate = nullptr; + EXPECT_EQ(interpreter_->ModifyGraphWithDelegate(delegate), 
kTfLiteDelegateError); // Verify that resulting interpreter still works, despite null delegate. ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk); diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 6e767beb635cfb..bd47abe905729c 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -1,12 +1,12 @@ load("@bazel_skylib//lib:selects.bzl", "selects") -load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps", "tflite_portable_test_suite") -load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gpu_delegate_linkopts") -load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework") -load("@build_bazel_rules_apple//apple:macos.bzl", "macos_dylib") load( "//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags", ) +load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps", "tflite_portable_test_suite") +load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gpu_delegate_linkopts") +load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework") +load("@build_bazel_rules_apple//apple:macos.bzl", "macos_dylib") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -33,19 +33,6 @@ config_setting( }, ) -# copybara:uncomment_begin(google-only) -# config_setting( -# name = "tflite_gpu_angle", -# flag_values = { -# "//tools/cpp:cc_target_os": "linux-google", -# "//third_party/angle:use_angle": "True", -# }, -# values = { -# "cpu": "k8", -# }, -# ) -# copybara:uncomment_end - cc_library( name = "gl_delegate", srcs = ["gl_delegate.cc"], @@ -92,7 +79,6 @@ objc_library( srcs = ["metal_delegate.mm"], hdrs = ["metal_delegate.h"], copts = ["-std=c++17"], - features = ["-layering_check"], module_name = "TensorFlowLiteCMetal", sdk_frameworks = ["Metal"], deps = [ @@ -108,11 +94,13 @@ objc_library( "//tensorflow/lite/delegates/gpu/common:quantization_util", "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/metal:buffer_convert", + "//tensorflow/lite/delegates/gpu/metal:common", "//tensorflow/lite/delegates/gpu/metal:inference_context", "//tensorflow/lite/delegates/gpu/metal:metal_spatial_tensor", + "//tensorflow/lite/kernels:kernel_util", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/types:span", ], ) @@ -172,7 +160,7 @@ ios_static_framework( "metal_delegate.h", "metal_delegate_internal.h", ], - minimum_os_version = "11.4", + minimum_os_version = "12.0", deps = [":metal_delegate"], ) @@ -184,7 +172,7 @@ macos_dylib( "-all_load", "-dead_strip", ], - minimum_os_version = "10.13", + minimum_os_version = "12.0", tags = [ "manual", "nobuilder", @@ -269,6 +257,7 @@ cc_library( ], "//conditions:default": [], }) + [ + ":android_hardware_buffer", ":api", ":delegate_options", ":tflite_profile", @@ -304,11 +293,27 @@ cc_library( ], ) +cc_library( + name = "android_hardware_buffer", + srcs = ["android_hardware_buffer.cc"], + hdrs = ["android_hardware_buffer.h"], +) + +cc_test( + name = "android_hardware_buffer_test", + srcs = ["android_hardware_buffer_test.cc"], + deps = [ + ":android_hardware_buffer", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "async_buffers", srcs = ["async_buffers.cc"], hdrs = ["async_buffers.h"], deps = [ + ":android_hardware_buffer", ":api", "//tensorflow/lite/delegates/gpu/common:data_type", 
"//tensorflow/lite/delegates/gpu/gl:gl_errors", @@ -326,6 +331,7 @@ cc_test( "tflite_not_portable_ios", ], deps = [ + ":android_hardware_buffer", ":async_buffers", ":delegate", "@com_google_googletest//:gtest_main", diff --git a/tensorflow/lite/delegates/gpu/android_hardware_buffer.cc b/tensorflow/lite/delegates/gpu/android_hardware_buffer.cc new file mode 100644 index 00000000000000..e9bf3040b8f72a --- /dev/null +++ b/tensorflow/lite/delegates/gpu/android_hardware_buffer.cc @@ -0,0 +1,53 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/android_hardware_buffer.h" + +#include + +namespace tflite::gpu { + +OptionalAndroidHardwareBuffer::OptionalAndroidHardwareBuffer() { +#ifdef __ANDROID__ + dlopen_handle_ = dlopen("libnativewindow.so", RTLD_NOW); + if (dlopen_handle_ == nullptr) { + supported_ = false; + return; + } + allocate_ = reinterpret_cast( + dlsym(dlopen_handle_, "AHardwareBuffer_allocate")); + acquire_ = reinterpret_cast( + dlsym(dlopen_handle_, "AHardwareBuffer_acquire")); + release_ = reinterpret_cast( + dlsym(dlopen_handle_, "AHardwareBuffer_release")); + describe_ = reinterpret_cast( + dlsym(dlopen_handle_, "AHardwareBuffer_describe")); + is_supported_ = reinterpret_cast( + dlsym(dlopen_handle_, "AHardwareBuffer_isSupported")); + supported_ = + (allocate_ != nullptr && acquire_ != nullptr && release_ != nullptr && + describe_ != nullptr && is_supported_ != nullptr); +#else + dlopen_handle_ = nullptr; + allocate_ = nullptr; + acquire_ = nullptr; + release_ = nullptr; + describe_ = nullptr; + is_supported_ = nullptr; + supported_ = false; +#endif +} + +} // namespace tflite::gpu diff --git a/tensorflow/lite/delegates/gpu/android_hardware_buffer.h b/tensorflow/lite/delegates/gpu/android_hardware_buffer.h new file mode 100644 index 00000000000000..dc272f6975ca06 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/android_hardware_buffer.h @@ -0,0 +1,130 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_ANDROID_HARDWARE_BUFFER_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_ANDROID_HARDWARE_BUFFER_H_ + +#include + +#ifdef __ANDROID__ +#include +#else +extern "C" { +typedef struct AHardwareBuffer AHardwareBuffer; + +// struct is a copy of the Android NDK AHardwareBuffer_Desc struct in the link +// below +// https://developer.android.com/ndk/reference/struct/a-hardware-buffer-desc +typedef struct AHardwareBuffer_Desc AHardwareBuffer_Desc; +struct AHardwareBuffer_Desc { + uint32_t width; + uint32_t height; + uint32_t layers; + uint32_t format; + uint64_t usage; + uint32_t stride; + uint32_t rfu0; + uint64_t rfu1; +}; +} // extern "C" +#endif // __ANDROID__ + +namespace tflite::gpu { + +// This header file and singleton class encapsulates the following Android NDK +// features +// - header +// - opaque struct type AHardwareBuffer +// - struct type AHardwareBuffer_Desc +// - function AHardwareBuffer_isSupported +// - function AHardwareBuffer_allocate +// - function AHardwareBuffer_acquire +// - function AHardwareBuffer_release +// - function AHardwareBuffer_describe +// - library libnativewindow.so (for the above features) +// +// For documentation on these features, see +// : +// +// Unlike using the native NDK functionality directly, this class only has a +// run-time dependency on API level 26, not a build-time dependency. So it can +// be used even when building with NDK min SDK level < 26, as long as you are +// very careful to check that Supported() returns true before calling any other +// methods. +class OptionalAndroidHardwareBuffer { + public: + static OptionalAndroidHardwareBuffer& Instance() { + static OptionalAndroidHardwareBuffer instance; + return instance; + } + + // Returns true if the functionality in this class is supported. + bool Supported() { return supported_; } + + // Like AHardwareBuffer_isSupported. + // Caller must check that Supported() returns true before calling this + // function. + int IsSupported(const AHardwareBuffer_Desc* description) { + return is_supported_(description); + } + + // Like AHardwareBuffer_allocate. + // Caller must check that Supported() returns true before calling this + // function. + int Allocate(const AHardwareBuffer_Desc* description, + AHardwareBuffer** buffer) { + return allocate_(description, buffer); + } + + // Like AHardwareBuffer_acquire. + // Caller must check that Supported() returns true before calling this + // function. + void Acquire(AHardwareBuffer* buffer) { return acquire_(buffer); } + + // Like AHardwareBuffer_release. + // Caller must check that Supported() returns true before calling this + // function. + void Release(AHardwareBuffer* buffer) { return release_(buffer); } + + // Like AHardwareBuffer_describe. + // Caller must check that Supported() returns true before calling this + // function. 
+ void Describe(AHardwareBuffer* buffer, AHardwareBuffer_Desc* desc) { + return describe_(buffer, desc); + } + + private: + void* dlopen_handle_; + int (*is_supported_)(const AHardwareBuffer_Desc* desc); + int (*allocate_)(const AHardwareBuffer_Desc* desc, AHardwareBuffer** buffer); + void (*acquire_)(AHardwareBuffer* buffer); + void (*release_)(AHardwareBuffer* buffer); + void (*describe_)(AHardwareBuffer* buffer, AHardwareBuffer_Desc* desc); + bool supported_; + + OptionalAndroidHardwareBuffer(); + OptionalAndroidHardwareBuffer(const OptionalAndroidHardwareBuffer&) = delete; + // Note that we deliberately do not call dlclose() in the destructor; doing + // so would complicate the code and would unnecessarily introduce additional + // failure scenarios. The object is a singleton and so is only destroyed when + // the process is about to exit, and the OS will automatically reclaim the + // resources on process exit anyway, so calling dlclose would only slow down + // process exit. + ~OptionalAndroidHardwareBuffer() = default; +}; + +} // namespace tflite::gpu + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_ANDROID_HARDWARE_BUFFER_H_ diff --git a/tensorflow/lite/delegates/gpu/android_hardware_buffer_test.cc b/tensorflow/lite/delegates/gpu/android_hardware_buffer_test.cc new file mode 100644 index 00000000000000..9f1c35fc5c2d73 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/android_hardware_buffer_test.cc @@ -0,0 +1,75 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
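// Usage sketch for the OptionalAndroidHardwareBuffer wrapper defined above
// (assumes an Android build where the NDK AHARDWAREBUFFER_* enum constants are
// visible; the helper name is illustrative). Supported() must be checked
// before any other call.
AHardwareBuffer* AllocateBlobAhwb(uint32_t size_bytes) {
  auto& ahwb_lib = tflite::gpu::OptionalAndroidHardwareBuffer::Instance();
  if (!ahwb_lib.Supported()) return nullptr;  // pre-API-26 device or non-Android host
  AHardwareBuffer_Desc desc = {};
  desc.width = size_bytes;
  desc.height = 1;
  desc.layers = 1;
  desc.format = AHARDWAREBUFFER_FORMAT_BLOB;
  desc.usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN;
  AHardwareBuffer* buffer = nullptr;
  if (!ahwb_lib.IsSupported(&desc) || ahwb_lib.Allocate(&desc, &buffer) != 0) {
    return nullptr;
  }
  return buffer;  // balance with Instance().Release(buffer) when done
}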
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/android_hardware_buffer.h" + +#include + +using tflite::gpu::OptionalAndroidHardwareBuffer; +auto Instance = OptionalAndroidHardwareBuffer::Instance; + +namespace { + +#ifndef __ANDROID__ + +TEST(OptionalAndroidHardwareBufferTest, NotSupportedOnNonAndroid) { + EXPECT_EQ(Instance().Supported(), false); +} + +#else // defined(__ANDROID__) + +TEST(OptionalAndroidHardwareBufferTest, SupportedOnAndroid) { + EXPECT_EQ(Instance().Supported(), true); +} + +TEST(OptionalAndroidHardwareBufferTest, CanAllocateAndReleaseOnAndroid) { + EXPECT_EQ(Instance().Supported(), true); + AHardwareBuffer* buffer; + AHardwareBuffer_Desc description{}; + description.width = 1600; + description.height = 1; + description.layers = 1; + description.rfu0 = 0; + description.rfu1 = 0; + description.stride = 1; + description.format = AHARDWAREBUFFER_FORMAT_BLOB; + description.usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN; + EXPECT_TRUE(Instance().IsSupported(&description)); + EXPECT_EQ(Instance().Allocate(&description, &buffer), 0); + Instance().Release(buffer); +} + +TEST(OptionalAndroidHardwareBufferTest, CanAcquireAndReleaseOnAndroid) { + EXPECT_EQ(Instance().Supported(), true); + AHardwareBuffer* buffer; + AHardwareBuffer_Desc description{}; + description.width = 1600; + description.height = 1; + description.layers = 1; + description.rfu0 = 0; + description.rfu1 = 0; + description.stride = 1; + description.format = AHARDWAREBUFFER_FORMAT_BLOB; + description.usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN; + EXPECT_TRUE(Instance().IsSupported(&description)); + EXPECT_EQ(Instance().Allocate(&description, &buffer), 0); + Instance().Acquire(buffer); + Instance().Release(buffer); // To match Acquire + Instance().Release(buffer); // To match Allocate +} + +#endif // defined(__ANDROID__) + +} // namespace diff --git a/tensorflow/lite/delegates/gpu/async_buffers.cc b/tensorflow/lite/delegates/gpu/async_buffers.cc index 78e201f2102e6d..3c988857506ffd 100644 --- a/tensorflow/lite/delegates/gpu/async_buffers.cc +++ b/tensorflow/lite/delegates/gpu/async_buffers.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/status/status.h" +#include "tensorflow/lite/delegates/gpu/android_hardware_buffer.h" #include "tensorflow/lite/delegates/gpu/gl/gl_errors.h" namespace { @@ -74,11 +75,9 @@ absl::Status AsyncBuffer::AllocateOpenGlBuffer() { if (!status.ok()) { // If we can't map to SSBO, clear AHWB & SSBO if (ahwb_ != nullptr) { -#if (__ANDROID__) - if (__builtin_available(android 26, *)) { - AHardwareBuffer_release(ahwb_); + if (OptionalAndroidHardwareBuffer::Instance().Supported()) { + OptionalAndroidHardwareBuffer::Instance().Release(ahwb_); } -#endif ahwb_ = nullptr; } glBufferData(GL_SHADER_STORAGE_BUFFER, bytes_, nullptr, GL_STREAM_COPY); diff --git a/tensorflow/lite/delegates/gpu/async_buffers_test.cc b/tensorflow/lite/delegates/gpu/async_buffers_test.cc index 2f51e408661358..649c41f4be6797 100644 --- a/tensorflow/lite/delegates/gpu/async_buffers_test.cc +++ b/tensorflow/lite/delegates/gpu/async_buffers_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
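// The async_buffers.cc change above swaps the compile-time __ANDROID__ /
// __builtin_available guards for a run-time Supported() check. The same
// platform-neutral release idiom as a standalone sketch (the helper name is
// illustrative):
void SafeReleaseAhwb(AHardwareBuffer*& ahwb) {
  auto& ahwb_lib = tflite::gpu::OptionalAndroidHardwareBuffer::Instance();
  if (ahwb != nullptr && ahwb_lib.Supported()) {
    ahwb_lib.Release(ahwb);
  }
  ahwb = nullptr;  // on unsupported platforms the handle should never be non-null anyway
}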
#include #include +#include "tensorflow/lite/delegates/gpu/android_hardware_buffer.h" #include "tensorflow/lite/delegates/gpu/api.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/gl/egl_environment.h" @@ -28,38 +29,44 @@ namespace gpu { namespace { TEST(AsyncBufferTest, DuplicateTest) { - // Create tie - TensorObjectDef* tie = new TensorObjectDef(); - tie->object_def.data_type = DataType::FLOAT32; - tie->object_def.data_layout = DataLayout::BHWC; - tie->dimensions = Dimensions(2, 2, 2, 2); + if (__builtin_available(android 26, *)) { + auto Instance = OptionalAndroidHardwareBuffer::Instance; + // Create tie + TensorObjectDef* tie = new TensorObjectDef(); + tie->object_def.data_type = DataType::FLOAT32; + tie->object_def.data_layout = DataLayout::BHWC; + tie->dimensions = Dimensions(2, 2, 2, 2); - // Create AHWB - AHardwareBuffer_Desc buffDesc = {}; - buffDesc.width = 1000; - buffDesc.height = 1; - buffDesc.layers = 1; - buffDesc.format = AHARDWAREBUFFER_FORMAT_BLOB; - buffDesc.usage = AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN | - AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | - AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; - AHardwareBuffer* ahwb; - EXPECT_EQ(AHardwareBuffer_allocate(&buffDesc, &ahwb), 0); + // Create AHWB + AHardwareBuffer_Desc buffDesc = {}; + buffDesc.width = 1000; + buffDesc.height = 1; + buffDesc.layers = 1; + buffDesc.format = AHARDWAREBUFFER_FORMAT_BLOB; + buffDesc.usage = AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + AHardwareBuffer* ahwb; + EXPECT_TRUE(Instance().IsSupported(&buffDesc)); + EXPECT_EQ(Instance().Allocate(&buffDesc, &ahwb), 0); - // Init GL Env to properly use gl fcns - std::unique_ptr env; - EXPECT_OK(gl::EglEnvironment::NewEglEnvironment(&env)); - AsyncBuffer async_buffer1 = AsyncBuffer(*tie, ahwb); - GLuint buffer1, buffer2; - EXPECT_OK(async_buffer1.GetOpenGlBuffer(buffer1)); - EXPECT_GE(buffer1, 0); - EXPECT_OK(async_buffer1.GetOpenGlBuffer(buffer2)); - // Check that each instance of AsyncBuffer class has only one id - EXPECT_EQ(buffer1, buffer2); - AsyncBuffer async_buffer2 = AsyncBuffer(*tie, ahwb); - EXPECT_OK(async_buffer2.GetOpenGlBuffer(buffer2)); - // Check that each different instance will produce unique id - EXPECT_NE(buffer1, buffer2); + // Init GL Env to properly use gl fcns + std::unique_ptr env; + EXPECT_OK(gl::EglEnvironment::NewEglEnvironment(&env)); + AsyncBuffer async_buffer1 = AsyncBuffer(*tie, ahwb); + GLuint buffer1, buffer2; + EXPECT_OK(async_buffer1.GetOpenGlBuffer(buffer1)); + EXPECT_GE(buffer1, 0); + EXPECT_OK(async_buffer1.GetOpenGlBuffer(buffer2)); + // Check that each instance of AsyncBuffer class has only one id + EXPECT_EQ(buffer1, buffer2); + AsyncBuffer async_buffer2 = AsyncBuffer(*tie, ahwb); + EXPECT_OK(async_buffer2.GetOpenGlBuffer(buffer2)); + // Check that each different instance will produce unique id + EXPECT_NE(buffer1, buffer2); + } else { + GTEST_SKIP(); + } } } // namespace diff --git a/tensorflow/lite/delegates/gpu/build_defs.bzl b/tensorflow/lite/delegates/gpu/build_defs.bzl index e6e6fa2be3934e..cdea91aec86507 100644 --- a/tensorflow/lite/delegates/gpu/build_defs.bzl +++ b/tensorflow/lite/delegates/gpu/build_defs.bzl @@ -1,17 +1,5 @@ """Additional build options needed for the GPU Delegate.""" -# copybara:uncomment_begin(google-only) -# load("//third_party/android/ndk/platforms:grte_top.bzl", "min_supported_ndk_api") -# copybara:uncomment_end - -def nativewindow_linkopts(): - # 
copybara:uncomment_begin(google-only) - # return min_supported_ndk_api("26", ["-lnativewindow"]) - # copybara:uncomment_end - # copybara:comment_begin(oss-only) - return ["-lnativewindow"] - # copybara:comment_end - def gpu_delegate_linkopts(): """Additional link options needed when linking in the GPU Delegate.""" return select({ @@ -24,7 +12,7 @@ def gpu_delegate_linkopts(): "-lGLESv2", ], "//conditions:default": [], - }) + nativewindow_linkopts() + }) def tflite_angle_heapcheck_deps(): # copybara:uncomment_begin(google-only) @@ -40,3 +28,11 @@ def tflite_angle_heapcheck_deps(): # copybara:comment_begin(oss-only) return ["@com_google_googletest//:gtest_main"] # copybara:comment_end + +def gtest_main_no_heapcheck_deps(): + # copybara:uncomment_begin(google-only) + # return ["@com_google_googletest//:gtest_main_no_heapcheck"] + # copybara:uncomment_end + # copybara:comment_begin(oss-only) + return ["@com_google_googletest//:gtest_main"] + # copybara:comment_end diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 760f401bdc61ea..ae7c23280538d2 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -428,18 +428,9 @@ cc_library( srcs = ["opencl_wrapper.cc"], hdrs = ["opencl_wrapper.h"], linkopts = select({ - "//tensorflow:android": [ - "-ldl", # opencl_wrapper calls dlopen() - "-lm", - ], - # copybara:uncomment_begin(google-only) - # "//tools/cc_target_os:linux-google": [ - # "-ldl", - # "-rdynamic", - # ], - # copybara:uncomment_end - "//conditions:default": ["-ldl"], # opencl_wrapper calls dlopen() - }), + "//tensorflow:android": ["-lm"], + "//conditions:default": [], + }) + ["-ldl"], # opencl_wrapper calls dlopen() deps = [ "//tensorflow/lite/delegates/gpu/common:status", "@com_google_absl//absl/strings", diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 889b178463e5f1..23ac928214a42d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -2,6 +2,7 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags", ) +load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gtest_main_no_heapcheck_deps") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -18,14 +19,13 @@ cc_test( "notsan", "requires-gpu-nvidia", ], + # TODO(b/279977471) Once b/279347631 is resolved, check for heap again deps = [ ":cl_test", - # TODO(b/279977471) Once b/279347631 is resolved, check for heap again - "@com_google_googletest//:gtest_main_no_heapcheck", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:add_test_util", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -42,8 +42,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:cast_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_library( @@ -76,8 +75,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:concat_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -94,8 +92,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", 
"//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:conv_constants_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", # constant buffers leak on nvidia - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -112,8 +109,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:conv_generic_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -130,8 +126,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:conv_weights_converter_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_library( @@ -173,8 +168,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:convolution_transposed_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -191,8 +185,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:convolution_transposed_3x3_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -209,8 +202,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:convolution_transposed_3x3_thin_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", # constant buffers leak on nvidia - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -227,8 +219,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:convolution_transposed_4x4_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -245,8 +236,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:convolution_transposed_thin_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", # constant buffers leak on nvidia - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -263,8 +253,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:cumsum_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -281,8 +270,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:depthwise_conv_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -300,8 +288,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:depthwise_conv_3x3_stride_h2_test_util", "//tensorflow/lite/delegates/gpu/common/tasks:depthwise_conv_3x3_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( 
@@ -318,8 +305,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:elementwise_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -341,8 +327,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common/task:gpu_operation", "//tensorflow/lite/delegates/gpu/common/tasks:fully_connected", "//tensorflow/lite/delegates/gpu/common/tasks:fully_connected_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -359,8 +344,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:gather_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -377,8 +361,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:lstm_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -414,8 +397,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:max_unpooling_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -432,8 +414,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:mean_stddev_normalization_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -450,8 +431,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:one_hot_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -468,8 +448,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:padding_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -486,8 +465,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:pooling_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -504,8 +482,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:prelu_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -522,8 +499,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:quantize_and_dequantize_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -540,8 +516,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", 
"//tensorflow/lite/delegates/gpu/common/tasks:reduce_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -558,8 +533,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:relu_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -576,8 +550,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:resampler_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -594,8 +567,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:reshape_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -612,8 +584,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:reshape_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -630,8 +601,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:select_v2_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -648,8 +618,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:softmax_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -666,8 +635,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:softmax_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -684,8 +652,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:space_to_depth_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -702,8 +669,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:split_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -720,8 +686,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:strided_slice_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -738,8 +703,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:tile_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -756,8 +720,7 
@@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:transpose_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -774,8 +737,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:resize_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) cc_test( @@ -792,8 +754,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/tasks:winograd_test_util", - "@com_google_googletest//:gtest_main_no_heapcheck", - ], + ] + gtest_main_no_heapcheck_deps(), ) test_suite( diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD index 75e36c0c9ca877..e333bb6daf5628 100644 --- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gtest_main_no_heapcheck_deps") + package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = ["//visibility:public"], @@ -35,8 +37,7 @@ cc_test( "//tensorflow/lite/delegates/gpu/cl/kernels:cl_test", "//tensorflow/lite/delegates/gpu/common:gpu_model_test_util", "//tensorflow/lite/delegates/gpu/common:status", - "@com_google_googletest//:gtest_main_no_heapcheck", # constant buffers leak on nvidia - ], + ] + gtest_main_no_heapcheck_deps(), # constant buffers leak on nvidia ) cc_binary( diff --git a/tensorflow/lite/delegates/gpu/common/BUILD b/tensorflow/lite/delegates/gpu/common/BUILD index 7f50c878263cd9..0a55ad05a76968 100644 --- a/tensorflow/lite/delegates/gpu/common/BUILD +++ b/tensorflow/lite/delegates/gpu/common/BUILD @@ -1,6 +1,6 @@ -load("//tensorflow/core/platform:build_config.bzl", "tf_platform_alias") load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") load("//tensorflow:tensorflow.bzl", "workspace_root") +load("//tensorflow/core/platform:build_config.bzl", "tf_platform_alias") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -299,6 +299,16 @@ cc_library( ], ) +cc_test( + name = "model_builder_helper_test", + srcs = ["model_builder_helper_test.cc"], + deps = [ + ":model_builder_helper", + "//tensorflow/lite/core/c:private_common", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "model_hints", hdrs = ["model_hints.h"], diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.cc b/tensorflow/lite/delegates/gpu/common/gpu_info.cc index 8f41a9ef1a715a..30489a1721f9d8 100644 --- a/tensorflow/lite/delegates/gpu/common/gpu_info.cc +++ b/tensorflow/lite/delegates/gpu/common/gpu_info.cc @@ -39,6 +39,7 @@ GpuVendor GetGpuVendor(const std::string& gpu_description) { {"nvidia", GpuVendor::kNvidia}, {"amd", GpuVendor::kAMD}, {"radeon", GpuVendor::kAMD}, + {"xclipse", GpuVendor::kAMD}, {"power", GpuVendor::kPowerVR}, }; for (const auto& v : kMapping) { @@ -625,15 +626,6 @@ void GetGpuInfoFromDeviceDescription(const std::string& gpu_description, absl::AsciiStrToLower(&lowered); gpu_info->vendor = GetGpuVendor(lowered); - // Because clvk is an OpenCL layer on top of vulkan, it does not react to CL - // optimisation as native CL implementation does. 
- // AMD is particularly affected, thus let's manage it differently to get the - // best performances out of it. - if (gpu_info->IsApiOpenCl() && gpu_info->opencl_info.IsCLVK() && - gpu_info->IsAMD()) { - gpu_info->vendor = GpuVendor::kUnknown; - } - if (gpu_info->IsAdreno()) { gpu_info->adreno_info = AdrenoInfo(lowered); } else if (gpu_info->IsApple()) { diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc index b498916bfed447..66224878c617ae 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc @@ -16,10 +16,9 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model_builder_helper.h" #include -#include -#include -#include +#include +#include #include #include #include @@ -257,34 +256,32 @@ void ConvertFloat16ToFloat32(size_t num_elements, const uint16_t* src, } template <> -absl::Status CreateVectorCopyData(const TfLiteTensor& tensor, - float* tensor_data) { - switch (tensor.type) { +absl::Status CreateVectorCopyData(const TfLiteTensor& src, float* dst) { + switch (src.type) { case kTfLiteFloat32: - std::memcpy(tensor_data, tensor.data.f, tensor.bytes); - break; + std::memcpy(dst, src.data.f, src.bytes); + return absl::OkStatus(); case kTfLiteFloat16: - ConvertFloat16ToFloat32( - NumElements(&tensor), - reinterpret_cast(tensor.data.f16), tensor_data); - break; + ConvertFloat16ToFloat32(NumElements(&src), + reinterpret_cast(src.data.f16), + dst); + return absl::OkStatus(); case kTfLiteInt8: - DequantizeConstantTensor(tensor, tensor.data.int8, tensor_data); - break; + DequantizeConstantTensor(src, src.data.int8, dst); + return absl::OkStatus(); case kTfLiteUInt8: - DequantizeConstantTensor(tensor, tensor.data.uint8, tensor_data); - break; + DequantizeConstantTensor(src, src.data.uint8, dst); + return absl::OkStatus(); case kTfLiteInt32: - DequantizeConstantTensor(tensor, tensor.data.i32, tensor_data); - break; + DequantizeConstantTensor(src, src.data.i32, dst); + return absl::OkStatus(); default: return absl::InvalidArgumentError( "Unsupported data type for float32 tensor"); } - return absl::OkStatus(); } -const std::string GetDimensionString(const TfLiteIntArray* dimensions) { +std::string GetDimensionString(const TfLiteIntArray* dimensions) { return absl::StrJoin(TfLiteIntArrayView(dimensions), "x"); } diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h index 88a4576d45d9bf..14384ce5be9a1c 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h @@ -17,10 +17,9 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_HELPER_H_ #include -#include -#include -#include +#include +#include #include "absl/strings/str_cat.h" #include "tensorflow/lite/core/c/builtin_op_data.h" @@ -33,6 +32,7 @@ limitations under the License. 
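// Caller-side sketch of the CreateVectorCopyData<float> specialization
// rewritten above: fp16, int8, uint8 and int32 constant tensors are converted
// (dequantized where applicable) into a float32 buffer in one call. The
// wrapper function below is illustrative; it assumes the GPU delegate's
// RETURN_IF_ERROR macro and kernel_util's NumElements() are in scope.
absl::Status ReadConstantAsFloat(const TfLiteTensor& tensor,
                                 std::vector<float>* out) {
  out->resize(NumElements(&tensor));
  RETURN_IF_ERROR(CreateVectorCopyData<float>(tensor, out->data()));
  return absl::OkStatus();
}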
#include "tensorflow/lite/kernels/internal/reference/dequantize.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { namespace gpu { @@ -100,19 +100,94 @@ inline void DequantizeConstantTensor(const TfLiteTensor& tensor, } template -absl::Status CreateVectorCopyData(const TfLiteTensor& tensor, T* tensor_data) { - if (tensor.bytes % sizeof(T) != 0) { +absl::Status CreateVectorCopyData(const TfLiteTensor& src, T* dst) { + if (src.bytes % sizeof(T) != 0) { return absl::InvalidArgumentError( - absl::StrCat("Input data size ", tensor.bytes, + absl::StrCat("Input data size ", src.bytes, " is not aligned to expected type: ", sizeof(T))); } - std::memcpy(tensor_data, tensor.data.uint8, tensor.bytes); - return absl::OkStatus(); + if (const int n = tflite::NumElements(&src); n * sizeof(T) == src.bytes) { + std::memcpy(dst, src.data.raw_const, src.bytes); + return absl::OkStatus(); + } else { + switch (src.type) { + case kTfLiteNoType: + return absl::InvalidArgumentError("src has no type."); + case kTfLiteFloat32: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteInt32: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteUInt8: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteInt64: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteString: + return absl::UnimplementedError("src can't be string."); + case kTfLiteBool: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteInt16: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteComplex64: + return absl::UnimplementedError("src can't be complex64."); + case kTfLiteInt8: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteFloat16: + return absl::UnimplementedError("src can't be float16."); + case kTfLiteFloat64: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteComplex128: + return absl::UnimplementedError("src can't be complex128."); + case kTfLiteUInt64: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteResource: + return absl::UnimplementedError("src can't be resource."); + case kTfLiteVariant: + return absl::UnimplementedError("src can't be variant."); + case kTfLiteUInt32: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteUInt16: + for (int i = 0; i < n; ++i) { + dst[i] = tflite::GetTensorData(&src)[i]; + } + return absl::OkStatus(); + case kTfLiteInt4: + return absl::UnimplementedError("src can't be int4."); + } + } } template <> -absl::Status CreateVectorCopyData(const TfLiteTensor& tensor, - float* tensor_data); +absl::Status CreateVectorCopyData(const TfLiteTensor& src, float* dst); absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Scalar* shape); diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper_test.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper_test.cc new file mode 100644 index 
00000000000000..f13bc539785467 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper_test.cc @@ -0,0 +1,48 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/common/model_builder_helper.h" + +#include + +#include +#include +#include "tensorflow/lite/core/c/common.h" + +namespace tflite { +namespace gpu { +namespace { + +using ::testing::ElementsAre; + +TEST(ModelBuilderHelperTest, CreateVectorCopyDataDifferentSize) { + TfLiteTensor tflite_tensor; + tflite_tensor.type = kTfLiteInt32; + int32_t src_data[4] = {1, 2, 3, 4}; + tflite_tensor.data.i32 = src_data; + tflite_tensor.dims = TfLiteIntArrayCreate(1); + tflite_tensor.dims->data[0] = sizeof(src_data) / sizeof(src_data[0]); + tflite_tensor.bytes = sizeof(src_data); + + int16_t dst[4]; + ASSERT_OK(CreateVectorCopyData(tflite_tensor, dst)); + EXPECT_THAT(dst, ElementsAre(1, 2, 3, 4)); + + TfLiteIntArrayFree(tflite_tensor.dims); +} + +} // namespace +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/tasks/conv_generic.cc b/tensorflow/lite/delegates/gpu/common/tasks/conv_generic.cc index 72e54dd21c94f5..edcdff50dbc991 100644 --- a/tensorflow/lite/delegates/gpu/common/tasks/conv_generic.cc +++ b/tensorflow/lite/delegates/gpu/common/tasks/conv_generic.cc @@ -886,7 +886,8 @@ std::string ConvGeneric::GenerateConv(const GpuInfo& gpu_info, std::to_string(s * 4 + ch + shared_offset); std::string w_val; if (conv_params.AreWeightsBuffer()) { - if (gpu_info.SupportsPointersInKernels()) { + if (need_local_mem || + gpu_info.SupportsPointersInKernels()) { w_val = "weights_cache[" + weight_id + "]"; } else { w_val = "args.weights.Read(filters_offset + " + @@ -926,7 +927,7 @@ std::string ConvGeneric::GenerateConv(const GpuInfo& gpu_info, std::string weight_id = std::to_string(s * 4 + i + shared_offset); if (conv_params.AreWeightsBuffer()) { - if (gpu_info.SupportsPointersInKernels()) { + if (need_local_mem || gpu_info.SupportsPointersInKernels()) { F[i] = "weights_cache[" + weight_id + "]"; } else { F[i] = @@ -1113,7 +1114,7 @@ std::string ConvGeneric::GenerateConv(const GpuInfo& gpu_info, c += " if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n"; c += " {\n"; if (conv_params.AreWeightsBuffer() && - gpu_info.SupportsPointersInKernels()) { + (need_local_mem || gpu_info.SupportsPointersInKernels())) { c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n"; } else { c += " FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n"; @@ -1748,8 +1749,7 @@ ConvGeneric::ConvParams ConvGeneric::GuessBestParams( conv_params.fixed_work_group_size = false; conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4; - } else if (gpu_info.IsIntel() || - (gpu_info.IsApiOpenCl() && gpu_info.opencl_info.IsCLVK())) { + } else if (gpu_info.IsIntel()) { if 
(different_weights_for_height) { work_group_size_ = int3(16, 1, 1); work_group_launch_order_ = int3(0, 1, 2); diff --git a/tensorflow/lite/delegates/gpu/common/tasks/special/BUILD b/tensorflow/lite/delegates/gpu/common/tasks/special/BUILD index 74af3502abc116..86ab973e974763 100644 --- a/tensorflow/lite/delegates/gpu/common/tasks/special/BUILD +++ b/tensorflow/lite/delegates/gpu/common/tasks/special/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gtest_main_no_heapcheck_deps") + package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = ["//visibility:public"], @@ -26,18 +28,17 @@ cc_test( "notsan", "requires-gpu-nvidia", ], + # TODO(b/279977471) Once b/279347631 is resolved, check for heap again deps = [ ":conv_pointwise", - # TODO(b/279977471) Once b/279347631 is resolved, check for heap again - "@com_google_googletest//:gtest_main_no_heapcheck", "//tensorflow/lite/delegates/gpu/cl/kernels:cl_test", - "//tensorflow/lite/delegates/gpu/common/task:gpu_operation", - "//tensorflow/lite/delegates/gpu/common/task:testing_util", "//tensorflow/lite/delegates/gpu/common:precision", "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", - ], + "//tensorflow/lite/delegates/gpu/common/task:gpu_operation", + "//tensorflow/lite/delegates/gpu/common/task:testing_util", + ] + gtest_main_no_heapcheck_deps(), ) cc_library( diff --git a/tensorflow/lite/delegates/gpu/common/tasks/special/conv_pointwise_test.cc b/tensorflow/lite/delegates/gpu/common/tasks/special/conv_pointwise_test.cc index e77a488587df78..0af40dfaf8f07a 100644 --- a/tensorflow/lite/delegates/gpu/common/tasks/special/conv_pointwise_test.cc +++ b/tensorflow/lite/delegates/gpu/common/tasks/special/conv_pointwise_test.cc @@ -59,11 +59,11 @@ TEST_F(OpenCLOperationTest, SliceMulMeanConcat) { op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; GPUOperation operation = CreateConvPointwise(op_def, op_attr); - EXPECT_OK(env->ExecuteGPUOperation( + ASSERT_OK(env->ExecuteGPUOperation( {src_tensor, weights_tensor}, std::make_unique(std::move(operation)), BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_OK(PointWiseNear({5.5f, 5.5f, 8.5f, 8.5f}, dst_tensor.data, eps)); + ASSERT_OK(PointWiseNear({5.5f, 5.5f, 8.5f, 8.5f}, dst_tensor.data, eps)); } } } @@ -93,11 +93,11 @@ TEST_F(OpenCLOperationTest, SliceMulSumConcat) { op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; GPUOperation operation = CreateConvPointwise(op_def, op_attr); - EXPECT_OK(env->ExecuteGPUOperation( + ASSERT_OK(env->ExecuteGPUOperation( {src_tensor, weights_tensor}, std::make_unique(std::move(operation)), BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_OK( + ASSERT_OK( PointWiseNear({11.0f, 11.0f, 17.0f, 17.0f}, dst_tensor.data, eps)); } } diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc index c7232f69078de5..00c58d37e6f37b 100644 --- a/tensorflow/lite/delegates/gpu/delegate.cc +++ b/tensorflow/lite/delegates/gpu/delegate.cc @@ -46,6 +46,7 @@ limitations under the License. 
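// On the EXPECT_OK -> ASSERT_OK switch in conv_pointwise_test.cc above: the
// PointWiseNear() comparison consumes dst_tensor.data produced by
// ExecuteGPUOperation(), so a failed execution should abort the test body
// instead of continuing into a comparison against an empty tensor. Minimal
// illustration (RunOp is a hypothetical helper returning absl::Status):
TEST(StatusAssertions, FatalAssertStopsTheTestBody) {
  std::vector<float> out;
  ASSERT_OK(RunOp(&out));            // fatal: stops here if the op failed
  EXPECT_NEAR(out[0], 5.5f, 1e-6f);  // never reached with an empty `out`
}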
#endif #include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/delegates/gpu/android_hardware_buffer.h" #include "tensorflow/lite/delegates/gpu/api.h" #include "tensorflow/lite/delegates/gpu/cl/api.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" @@ -105,17 +106,9 @@ using tflite::delegates::utils::WriteSyncAttrs; } while (false) // This idiom allows selecting alternate code paths depending on whether or not -// AHWB is available. However, it's still necessary to directly guard calls to -// AHardwareBuffer_* functions with "if (__builtin_available(android 26, *))" to -// avoid compiler errors. -#define TFLITE_AHWB_AVAILABLE() \ - [] { \ - if (__builtin_available(android 26, *)) { \ - return true; \ - } else { \ - return false; \ - } \ - }() +// AHWB is available. +#define TFLITE_AHWB_AVAILABLE() \ + ::tflite::gpu::OptionalAndroidHardwareBuffer::Instance().Supported() namespace tflite { namespace gpu { @@ -772,24 +765,24 @@ class DelegateAsyncKernel : public BackendAsyncKernelInterface { using UniquePtrAHardwareBuffer = std::unique_ptr; static UniquePtrAHardwareBuffer Acquire(AHardwareBuffer* ahwb) { - if (__builtin_available(android 26, *)) { - AHardwareBuffer_acquire(ahwb); + if (OptionalAndroidHardwareBuffer::Instance().Supported()) { + OptionalAndroidHardwareBuffer::Instance().Acquire(ahwb); + return UniquePtrAHardwareBuffer(ahwb, [](AHardwareBuffer* b) { + OptionalAndroidHardwareBuffer::Instance().Release(b); + }); } else { TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "attempting AHardwareBuffer_acquire on a device without " "AHardwareBuffer support"); + return {nullptr, [](AHardwareBuffer*) {}}; } - return UniquePtrAHardwareBuffer(ahwb, [](AHardwareBuffer* b) { - if (__builtin_available(android 26, *)) { - AHardwareBuffer_release(b); - } - }); } static AHardwareBuffer_Desc Describe( const UniquePtrAHardwareBuffer& uptr_ahwb) { AHardwareBuffer_Desc desc_ahwb = {}; - if (__builtin_available(android 26, *)) { - AHardwareBuffer_describe(uptr_ahwb.get(), &desc_ahwb); + if (OptionalAndroidHardwareBuffer::Instance().Supported()) { + OptionalAndroidHardwareBuffer::Instance().Describe(uptr_ahwb.get(), + &desc_ahwb); } else { TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "attempting AHardwareBuffer_describe on a device without " diff --git a/tensorflow/lite/delegates/gpu/metal/BUILD b/tensorflow/lite/delegates/gpu/metal/BUILD index 8571ff7f04156c..0c555a4e6b9a12 100644 --- a/tensorflow/lite/delegates/gpu/metal/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/BUILD @@ -90,7 +90,7 @@ objc_library( ios_unit_test( name = "common_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -144,7 +144,6 @@ objc_library( copts = DEFAULT_COPTS + [ "-ObjC++", ], - features = ["-layering_check"], sdk_frameworks = ["Metal"], deps = [ ":compute_task", @@ -268,7 +267,7 @@ ios_application( "iphone", ], infoplists = ["Info.plist"], - minimum_os_version = "11.4", + minimum_os_version = "12.0", provisioning_profile = "//tensorflow/lite/delegates/gpu/metal:provisioning_profile.mobileprovision", tags = tf_gpu_tests_tags() + [ "local", @@ -298,7 +297,7 @@ objc_library( ios_unit_test( name = "ComponentsTests", - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + ["notap"], test_host = ":TestApplication", diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index 
72206b7678b140..06c295646eec6e 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -30,7 +30,7 @@ objc_library( ios_unit_test( name = "add_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -54,7 +54,7 @@ objc_library( ios_unit_test( name = "cast_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -77,7 +77,7 @@ objc_library( ios_unit_test( name = "concat_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -111,7 +111,7 @@ objc_library( ios_unit_test( name = "conv_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -134,7 +134,7 @@ objc_library( ios_unit_test( name = "conv_weights_converter_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -158,7 +158,7 @@ objc_library( ios_unit_test( name = "cumsum_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -183,7 +183,7 @@ objc_library( ios_unit_test( name = "depthwise_conv_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -207,7 +207,7 @@ objc_library( ios_unit_test( name = "elementwise_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -230,7 +230,7 @@ objc_library( ios_unit_test( name = "fully_connected_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -253,7 +253,7 @@ objc_library( ios_unit_test( name = "gather_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -276,7 +276,7 @@ objc_library( ios_unit_test( name = "lstm_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -299,7 +299,7 @@ objc_library( ios_unit_test( name = "max_unpooling_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -322,7 +322,7 @@ objc_library( ios_unit_test( name = "mean_stddev_normalization_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -346,7 +346,7 @@ objc_library( ios_unit_test( name = "one_hot_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -369,7 +369,7 @@ objc_library( ios_unit_test( name = "padding_test", testonly = 1, - 
minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -392,7 +392,7 @@ objc_library( ios_unit_test( name = "pooling_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -415,7 +415,7 @@ objc_library( ios_unit_test( name = "prelu_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -443,7 +443,7 @@ objc_library( ios_unit_test( name = "quantize_and_dequantize_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -466,7 +466,7 @@ objc_library( ios_unit_test( name = "reduce_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -489,7 +489,7 @@ objc_library( ios_unit_test( name = "relu_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -514,7 +514,7 @@ objc_library( ios_unit_test( name = "resampler_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = [ "no_mac", # TODO(b/183905399) @@ -539,7 +539,7 @@ objc_library( ios_unit_test( name = "resize_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -562,7 +562,7 @@ objc_library( ios_unit_test( name = "reshape_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -585,7 +585,7 @@ objc_library( ios_unit_test( name = "select_v2_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -608,7 +608,7 @@ objc_library( ios_unit_test( name = "slice_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -631,7 +631,7 @@ objc_library( ios_unit_test( name = "softmax_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -655,7 +655,7 @@ objc_library( ios_unit_test( name = "space_to_depth_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -678,7 +678,7 @@ objc_library( ios_unit_test( name = "split_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -701,7 +701,7 @@ objc_library( ios_unit_test( name = "tile_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -728,7 +728,7 @@ objc_library( ios_unit_test( name = "transpose_conv_test", testonly = 1, - minimum_os_version = 
"11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -751,7 +751,7 @@ objc_library( ios_unit_test( name = "transpose_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -804,7 +804,7 @@ objc_library( ios_unit_test( name = "winograd_test", testonly = 1, - minimum_os_version = "11.4", + minimum_os_version = "12.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index affa0600b1ed6d..ad5755f6c990a3 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -27,6 +27,12 @@ config_setting( define_values = {"xnnpack_force_float_precision": "fp16"}, ) +# Force XNNPACK to use all operators in the delegate. +config_setting( + name = "xnnpack_use_latest_ops_explicit", + define_values = {"xnnpack_use_latest_ops": "true"}, +) + # Enable offloading of quantized 8-bit signed operators to XNNPACK delegate config_setting( name = "tflite_with_xnnpack_qs8_explicit_true", @@ -214,6 +220,9 @@ cc_library( copts = tflite_copts() + select({ ":xnnpack_force_float_precision_explicit_fp16": ["-DXNNPACK_DELEGATE_FORCE_PRECISION_FP16=1"], "//conditions:default": [], + }) + select({ + ":xnnpack_use_latest_ops_explicit": ["-DXNNPACK_DELEGATE_USE_LATEST_OPS=1"], + "//conditions:default": [], }), linkstatic = True, deps = [ diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index ca7bdfc88804ed..498c7b0e5b7fda 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -532,8 +532,12 @@ class Delegate { } bool enable_latest_operators() const { +#ifdef XNNPACK_DELEGATE_USE_LATEST_OPS + return true; +#else return (options_.flags & TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS) != 0; +#endif } bool support_variable_ops() const { @@ -3688,8 +3692,9 @@ class Subgraph { TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation( logging_context, filter_tensor, node->inputs->data[1], node_index)); } else { - TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type( - delegate, logging_context, filter_tensor, node->inputs->data[1], + TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQCInt8Type( + delegate, logging_context, filter_tensor, + /*expected_quantized_dimension=*/0, node->inputs->data[1], node_index)); if (quasi_static_tensors.count(node->inputs->data[1]) == 0) { TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc index 42e056796ef14f..803a2a89c931ef 100644 --- a/tensorflow/lite/examples/label_image/label_image.cc +++ b/tensorflow/lite/examples/label_image/label_image.cc @@ -263,6 +263,10 @@ void RunInference(Settings* settings, LOG(INFO) << "number of outputs: " << outputs.size(); } + auto profiler = std::make_unique( + settings->max_profiling_buffer_entries); + interpreter->SetProfiler(profiler.get()); + auto delegates = delegate_providers.CreateAllDelegates(); for (auto& delegate : delegates) { const auto delegate_name = delegate.provider->GetName(); @@ -311,9 +315,6 @@ void RunInference(Settings* settings, << interpreter->tensor(input)->type << " yet"; exit(-1); } - auto profiler = std::make_unique( - 
settings->max_profiling_buffer_entries); - interpreter->SetProfiler(profiler.get()); if (settings->profiling) profiler->StartProfiling(); for (int i = 0; i < settings->number_of_warmup_runs; i++) { diff --git a/tensorflow/lite/experimental/acceleration/compatibility/gpu_compatibility.bin b/tensorflow/lite/experimental/acceleration/compatibility/gpu_compatibility.bin index 8108897c68e54b..417b66385ccfe7 100644 Binary files a/tensorflow/lite/experimental/acceleration/compatibility/gpu_compatibility.bin and b/tensorflow/lite/experimental/acceleration/compatibility/gpu_compatibility.bin differ diff --git a/tensorflow/lite/experimental/acceleration/mini_benchmark/c/c_api.h b/tensorflow/lite/experimental/acceleration/mini_benchmark/c/c_api.h index e62b599d7e5294..011e492048f352 100644 --- a/tensorflow/lite/experimental/acceleration/mini_benchmark/c/c_api.h +++ b/tensorflow/lite/experimental/acceleration/mini_benchmark/c/c_api.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_MINI_BENCHMARK_C_C_API_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_MINI_BENCHMARK_C_C_API_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/experimental/acceleration/mini_benchmark/c/c_api.h + #include "tensorflow/lite/core/experimental/acceleration/mini_benchmark/c/c_api.h" // IWYU pragma: export #endif // TENSORFLOW_LITE_EXPERIMENTAL_ACCELERATION_MINI_BENCHMARK_C_C_API_H_ diff --git a/tensorflow/lite/experimental/microfrontend/BUILD b/tensorflow/lite/experimental/microfrontend/BUILD index 1fb94ff67d2dea..e1c4f30baa7ffd 100644 --- a/tensorflow/lite/experimental/microfrontend/BUILD +++ b/tensorflow/lite/experimental/microfrontend/BUILD @@ -118,8 +118,8 @@ tf_custom_op_py_strict_library( srcs_version = "PY3", deps = [ ":audio_microfrontend_op", - "//tensorflow/python/framework", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:load_library", "//tensorflow/python/framework:ops", "//tensorflow/python/ops:array_ops", "//tensorflow/python/ops:control_flow_ops", diff --git a/tensorflow/lite/g3doc/microcontrollers/build_convert.md b/tensorflow/lite/g3doc/microcontrollers/build_convert.md index 402fc1eb26bd54..6c605d6a38a549 100644 --- a/tensorflow/lite/g3doc/microcontrollers/build_convert.md +++ b/tensorflow/lite/g3doc/microcontrollers/build_convert.md @@ -10,9 +10,8 @@ microcontrollers. It also outlines the supported operations and gives some guidance on designing and training a model to fit in limited memory. For an end-to-end, runnable example of building and converting a model, see the -following Colab which is part of the *Hello World* example: - -train_hello_world_model.ipynb +[Hello World](https://github.com/tensorflow/tflite-micro/tree/main/tensorflow/lite/micro/examples/hello_world#hello-world-example) +example. ## Model conversion @@ -54,7 +53,7 @@ important to change the array declaration to `const` for better memory efficiency on embedded platforms. For an example of how to include and use a model in your program, see -[`evaluate_test.cc`](https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/examples/hello_world/evaluate_test.cc) +[`hello_world_test.cc`](https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/examples/hello_world/hello_world_test.cc) in the *Hello World* example. 
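For the xnnpack_delegate.cc change above: when the new --define=xnnpack_use_latest_ops=true setting is active, the added -DXNNPACK_DELEGATE_USE_LATEST_OPS=1 copt makes enable_latest_operators() return true unconditionally; otherwise the behavior stays opt-in per delegate instance. A minimal sketch of that runtime opt-in, using the public XNNPACK delegate options API and the flag named in the hunk (the wrapper function itself is illustrative only):

#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

// Creates an XNNPACK delegate with the latest-operators flag set at runtime.
// Under -DXNNPACK_DELEGATE_USE_LATEST_OPS=1 this flag becomes redundant.
TfLiteDelegate* MakeXnnpackDelegateWithLatestOps() {
  TfLiteXNNPackDelegateOptions options = TfLiteXNNPackDelegateOptionsDefault();
  options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
  // The caller releases the delegate later with TfLiteXNNPackDelegateDelete().
  return TfLiteXNNPackDelegateCreate(&options);
}
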
## Model architecture and training diff --git a/tensorflow/lite/interpreter_builder.h b/tensorflow/lite/interpreter_builder.h index 01dfefe8a43ed8..346e08ed7cea22 100644 --- a/tensorflow/lite/interpreter_builder.h +++ b/tensorflow/lite/interpreter_builder.h @@ -17,7 +17,7 @@ limitations under the License. /// For documentation, see third_party/tensorflow/lite/core/interpreter_builder.h. -#include "tensorflow/lite/core/interpreter_builder.h" +#include "tensorflow/lite/core/interpreter_builder.h" // IWYU pragma: export namespace tflite { using InterpreterBuilder = ::tflite::impl::InterpreterBuilder; diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 3c0f0265236d5b..3f9fe7fea2a364 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -465,6 +465,14 @@ java_library( ], ) +java_library_with_tflite( + name = "test_init", + testonly = True, + srcs = [ + "src/test/java/org/tensorflow/lite/TestInit.java", + ], +) + #----------------------------------------------------------------------------- # java_library targets that also include native code dependencies. @@ -516,7 +524,6 @@ java_test_with_tflite( size = "small", srcs = [ "src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java", - "src/test/java/org/tensorflow/lite/TestInit.java", ], javacopts = JAVACOPTS, # We want to ensure that every test case in the test also verifies that the @@ -532,6 +539,9 @@ java_test_with_tflite( "v1only", ], test_class = "org.tensorflow.lite.TensorFlowLiteTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so", ], @@ -597,7 +607,6 @@ java_test_with_tflite( size = "small", srcs = [ "src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java", - "src/test/java/org/tensorflow/lite/TestInit.java", ], data = [ # The files named as .bin reshape the incoming tensor from (2, 8, 8, 3) to (2, 4, 4, 12). 
@@ -613,6 +622,9 @@ java_test_with_tflite( ], javacopts = JAVACOPTS, test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so", ], @@ -631,7 +643,6 @@ java_test_with_tflite( srcs = [ "src/test/java/org/tensorflow/lite/InterpreterTest.java", "src/test/java/org/tensorflow/lite/SupportedFeatures.java", - "src/test/java/org/tensorflow/lite/TestInit.java", "src/test/java/org/tensorflow/lite/TestUtils.java", ], data = [ @@ -646,6 +657,9 @@ java_test_with_tflite( ], javacopts = JAVACOPTS, test_class = "org.tensorflow.lite.InterpreterTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so", ], @@ -663,7 +677,6 @@ java_test_with_tflite( srcs = [ "src/test/java/org/tensorflow/lite/InterpreterApiTest.java", "src/test/java/org/tensorflow/lite/SupportedFeatures.java", - "src/test/java/org/tensorflow/lite/TestInit.java", "src/test/java/org/tensorflow/lite/TestUtils.java", ], data = [ @@ -677,6 +690,9 @@ java_test_with_tflite( ], javacopts = JAVACOPTS, test_class = "org.tensorflow.lite.InterpreterApiTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_stable_test_jni.so", ], @@ -695,7 +711,6 @@ java_test_with_tflite( srcs = [ "src/test/java/org/tensorflow/lite/InterpreterApiNoRuntimeTest.java", "src/test/java/org/tensorflow/lite/SupportedFeatures.java", - "src/test/java/org/tensorflow/lite/TestInit.java", "src/test/java/org/tensorflow/lite/TestUtils.java", ], data = [ @@ -703,6 +718,9 @@ java_test_with_tflite( ], javacopts = JAVACOPTS, test_class = "org.tensorflow.lite.InterpreterApiNoRuntimeTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_stable_test_jni.so", ], @@ -720,7 +738,6 @@ java_test_with_tflite( srcs = [ "src/test/java/org/tensorflow/lite/NnApiDelegateNativeTest.java", "src/test/java/org/tensorflow/lite/SupportedFeatures.java", - "src/test/java/org/tensorflow/lite/TestInit.java", "src/test/java/org/tensorflow/lite/TestUtils.java", ], data = [ @@ -728,6 +745,9 @@ java_test_with_tflite( ], tags = ["no_mac"], test_class = "org.tensorflow.lite.NnApiDelegateNativeTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so", ], @@ -745,7 +765,6 @@ java_test_with_tflite( size = "small", srcs = [ "src/test/java/org/tensorflow/lite/SupportedFeatures.java", - "src/test/java/org/tensorflow/lite/TestInit.java", "src/test/java/org/tensorflow/lite/TestUtils.java", "src/test/java/org/tensorflow/lite/nnapi/NnApiDelegateTest.java", ], @@ -755,6 +774,9 @@ java_test_with_tflite( javacopts = JAVACOPTS, tags = ["no_mac"], test_class = "org.tensorflow.lite.nnapi.NnApiDelegateTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ "//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so", ], @@ -797,7 +819,6 @@ java_test_with_tflite( size = "small", srcs = [ "src/test/java/org/tensorflow/lite/TensorTest.java", - "src/test/java/org/tensorflow/lite/TestInit.java", ], data = [ "src/testdata/add.bin", @@ -808,6 +829,9 @@ java_test_with_tflite( ], javacopts = JAVACOPTS, test_class = "org.tensorflow.lite.TensorTest", + tflite_deps = [ + ":test_init", + ], tflite_jni_binaries = [ 
"//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so", ], diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java index cf1371fb9dabba..33a5d41a9b6b03 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java @@ -32,7 +32,7 @@ public final class TensorFlowLite { // will discard those, and avoid logging messages with parameters (call String.format instead), // since the default Java log handler on Android only logs the raw message string and doesn't // apply the parameters. - private static final Logger logger = Logger.getLogger(InterpreterApi.class.getName()); + private static final Logger logger = Logger.getLogger(TensorFlowLite.class.getName()); private static final String[][] TFLITE_RUNTIME_LIBNAMES = new String[][] { diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestInit.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestInit.java index 3c46bc09ba3ee0..32b4ccf216c613 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestInit.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestInit.java @@ -14,8 +14,11 @@ ==============================================================================*/ package org.tensorflow.lite; +import java.util.logging.Logger; + /** Utilities for initializing TF Lite for tests. */ public final class TestInit { + private static final Logger logger = Logger.getLogger(TestInit.class.getName()); private TestInit() {} @@ -29,8 +32,15 @@ public static void init() { if (!initialized) { try { System.loadLibrary("tensorflowlite_test_jni"); + logger.info("Loaded native library for tests: tensorflowlite_test_jni"); } catch (UnsatisfiedLinkError e) { - System.loadLibrary("tensorflowlite_stable_test_jni"); + logger.info("Didn't load native library for tests: tensorflowlite_test_jni"); + try { + System.loadLibrary("tensorflowlite_stable_test_jni"); + logger.info("Loaded native library for tests: tensorflowlite_stable_test_jni"); + } catch (UnsatisfiedLinkError e2) { + logger.info("Didn't load native library for tests: tensorflowlite_stable_test_jni"); + } } initTfLiteForTest(); initialized = true; diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD index 2c32fc618331a8..db20aafcd2d0f8 100644 --- a/tensorflow/lite/java/src/test/native/BUILD +++ b/tensorflow/lite/java/src/test/native/BUILD @@ -30,10 +30,9 @@ cc_library_with_tflite( "interpreter_test_jni.cc", "nnapi_delegate_test_jni.cc", "supported_features_jni.cc", - "test_init_jni.cc", ], tflite_deps = [ - "//tensorflow/lite/c:test_util", + ":test_init_jni", "//tensorflow/lite/delegates/nnapi/java/src/main/native", "//tensorflow/lite/java/src/main/native", "//tensorflow/lite/java/src/main/native:jni_utils", @@ -51,6 +50,22 @@ cc_library_with_tflite( alwayslink = 1, ) +cc_library_with_tflite( + name = "test_init_jni", + testonly = 1, + srcs = [ + "test_init_jni.cc", + ], + tflite_deps = [ + "//tensorflow/lite/java/src/main/native:jni_utils", + "//tensorflow/lite/c:test_util", + ], + deps = [ + "//tensorflow/lite/java/jni", + ], + alwayslink = 1, +) + # Same as "native", but excluding dependencies on experimental features. 
cc_library_with_tflite( name = "native_stable", diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index bf2d189075d294..a0eb5ce425fc2a 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -301,7 +301,7 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":op_macros", - "//tensorflow/lite:arena_planner", + "//tensorflow/lite:util", "//tensorflow/lite/core/c:common", "//tensorflow/lite/kernels/internal:optimized_eigen", ], @@ -767,6 +767,7 @@ BUILTIN_KERNEL_SRCS = [ "stablehlo_gather.cc", "stablehlo_add.cc", "stablehlo_multiply.cc", + "stablehlo_pad.cc", "stablehlo_reduce_window.cc", "stablehlo_min_max.cc", "stablehlo_scatter.cc", @@ -1338,6 +1339,28 @@ cc_test( ], ) +cc_test( + name = "stablehlo_pad_test", + srcs = ["stablehlo_pad_test.cc"], + tags = ["tflite_nnapi"], + deps = [ + ":stablehlo_reduce_window_test_util", + ":test_main", + ":test_util", + "//tensorflow/lite/c:c_api_types", + "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/random", + "@com_google_absl//absl/random:bit_gen_ref", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "stablehlo_reduce_window_test_util", hdrs = ["stablehlo_reduce_window_test_util.h"], @@ -1358,7 +1381,6 @@ cc_test( cc_test( name = "stablehlo_reduce_window_test", - size = "small", srcs = ["stablehlo_reduce_window_test.cc"], tags = ["tflite_nnapi"], deps = [ diff --git a/tensorflow/lite/kernels/builtin_op_kernels.h b/tensorflow/lite/kernels/builtin_op_kernels.h index 54c4ccdd48838f..7b1a0975b0e26a 100644 --- a/tensorflow/lite/kernels/builtin_op_kernels.h +++ b/tensorflow/lite/kernels/builtin_op_kernels.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_BUILTIN_OP_KERNELS_H_ #define TENSORFLOW_LITE_KERNELS_BUILTIN_OP_KERNELS_H_ +/// For documentation, see +/// third_party/tensorflow/lite/core/kernels/builtin_op_kernels.h + #include "tensorflow/lite/core/kernels/builtin_op_kernels.h" namespace tflite { diff --git a/tensorflow/lite/kernels/cpu_backend_gemm.h b/tensorflow/lite/kernels/cpu_backend_gemm.h index 13374c41958ef5..af91b0a6de7336 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm.h +++ b/tensorflow/lite/kernels/cpu_backend_gemm.h @@ -176,7 +176,7 @@ template void Gemm(const MatrixParams& lhs_params, const int8_t* lhs_data, const MatrixParams& rhs_params, const int16_t* rhs_data, const MatrixParams& dst_params, int16_t* dst_data, - const GemmParams& params, + const GemmParams& params, CpuBackendContext* context) { ruy::profiler::ScopeLabel label("cpu_backend_gemm::Gemm"); ValidateParams(lhs_params, rhs_params, dst_params, params); @@ -187,7 +187,7 @@ void Gemm(const MatrixParams& lhs_params, const int8_t* lhs_data, // Currently, only Ruy backend supports 16x8 quant gemm so we use ruy // only. - detail::GemmImplUsingRuy::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data, diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc index 0dc977e876cfbf..22cf62d36d14e5 100644 --- a/tensorflow/lite/kernels/eigen_support.cc +++ b/tensorflow/lite/kernels/eigen_support.cc @@ -18,11 +18,14 @@ limitations under the License. 
#include #include -#include "tensorflow/lite/arena_planner.h" #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h" #include "tensorflow/lite/kernels/op_macros.h" +#ifndef EIGEN_DONT_ALIGN +#include "tensorflow/lite/util.h" +#endif // EIGEN_DONT_ALIGN + namespace tflite { namespace eigen_support { namespace { @@ -38,12 +41,11 @@ int GetNumThreads(int num_threads) { #ifndef EIGEN_DONT_ALIGN // Eigen may require buffers to be aligned to 16, 32 or 64 bytes depending on -// hardware architecture and build configurations. -// If the static assertion fails, try to increase `kDefaultTensorAlignment` to -// in `arena_planner.h` to 32 or 64. +// hardware architecture and build configurations. If the static assertion +// fails, try to increase `kDefaultTensorAlignment` in `util.h` to 32 or 64. static_assert( kDefaultTensorAlignment % EIGEN_MAX_ALIGN_BYTES == 0, - "kDefaultArenaAlignment doesn't comply with Eigen alignment requirement."); + "kDefaultTensorAlignment doesn't comply with Eigen alignment requirement."); #endif // EIGEN_DONT_ALIGN // Helper routine for updating the global Eigen thread count used for OpenMP. diff --git a/tensorflow/lite/kernels/gather_nd.cc b/tensorflow/lite/kernels/gather_nd.cc index 20224c01d86a89..10a62047375673 100644 --- a/tensorflow/lite/kernels/gather_nd.cc +++ b/tensorflow/lite/kernels/gather_nd.cc @@ -50,6 +50,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: case kTfLiteInt32: case kTfLiteString: + case kTfLiteBool: break; default: TF_LITE_KERNEL_LOG(context, @@ -157,6 +158,9 @@ TfLiteStatus EvalGatherNd(TfLiteContext* context, const TfLiteTensor* params, case kTfLiteString: status = GatherNdString(params, indices, output); break; + case kTfLiteBool: + status = GatherNd(params, indices, output); + break; default: TF_LITE_KERNEL_LOG(context, "Params type '%s' are not supported by gather_nd.", diff --git a/tensorflow/lite/kernels/internal/optimized/neon_check.h b/tensorflow/lite/kernels/internal/optimized/neon_check.h index bbf745ce1d12c7..8fdaeef44598d0 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_check.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_check.h @@ -17,12 +17,12 @@ limitations under the License. #if defined(__ARM_NEON__) || defined(__ARM_NEON) #define USE_NEON -#include +#include // IWYU pragma: export #endif #if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON #define USE_NEON -#include "NEON_2_SSE.h" +#include "NEON_2_SSE.h" // IWYU pragma: export #endif // NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 5609719398fcee..3299f610697bbf 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -49,8 +49,8 @@ limitations under the License. 
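The gather_nd.cc change above adds kTfLiteBool to the supported params types. A toy reference sketch of the semantics this enables, restricted to a 2-D bool params tensor indexed by row; the helper name and the 2-D simplification are illustrative, not the kernel's actual implementation.

#include <cstdint>
#include <vector>

// Gathers whole rows of a row-major (rows x cols) bool matrix: each entry of
// row_indices selects one row of params, and the rows are copied in order.
std::vector<bool> GatherBoolRows(const std::vector<bool>& params, int cols,
                                 const std::vector<int32_t>& row_indices) {
  std::vector<bool> output;
  output.reserve(row_indices.size() * cols);
  for (int32_t row : row_indices) {
    for (int c = 0; c < cols; ++c) {
      output.push_back(params[row * cols + c]);
    }
  }
  return output;
}
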
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" #include "tensorflow/lite/kernels/cpu_backend_threadpool.h" #include "tensorflow/lite/kernels/internal/cppmath.h" -#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h" +#include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops_utils.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" @@ -310,18 +310,18 @@ inline void FullyConnected( inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - uint8* output_data, CpuBackendContext* cpu_backend_context) { + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + uint8_t* output_data, CpuBackendContext* cpu_backend_context) { ruy::profiler::ScopeLabel label("FullyConnected/8bit"); - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); // TODO(b/62193649): This really should be: @@ -341,26 +341,26 @@ inline void FullyConnected( TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows); } - cpu_backend_gemm::MatrixParams lhs_params; + cpu_backend_gemm::MatrixParams lhs_params; lhs_params.rows = filter_rows; lhs_params.cols = filter_cols; lhs_params.order = cpu_backend_gemm::Order::kRowMajor; lhs_params.zero_point = -filter_offset; lhs_params.cache_policy = cpu_backend_gemm::DefaultCachePolicy(params.lhs_cacheable); - cpu_backend_gemm::MatrixParams rhs_params; + cpu_backend_gemm::MatrixParams rhs_params; rhs_params.rows = filter_cols; rhs_params.cols = batches; rhs_params.order = cpu_backend_gemm::Order::kColMajor; rhs_params.zero_point = -input_offset; rhs_params.cache_policy = cpu_backend_gemm::DefaultCachePolicy(params.rhs_cacheable); - cpu_backend_gemm::MatrixParams dst_params; + cpu_backend_gemm::MatrixParams dst_params; dst_params.rows = filter_rows; dst_params.cols = batches; dst_params.order = cpu_backend_gemm::Order::kColMajor; dst_params.zero_point = output_offset; - cpu_backend_gemm::GemmParams gemm_params; + cpu_backend_gemm::GemmParams gemm_params; gemm_params.bias = bias_data; gemm_params.clamp_min = output_activation_min; gemm_params.clamp_max = output_activation_max; @@ -373,18 +373,18 @@ inline void FullyConnected( inline void FullyConnected( const FullyConnectedParams& params, const 
RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data_int32, const RuntimeShape& output_shape, - int16* output_data, CpuBackendContext* cpu_backend_context) { + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data_int32, const RuntimeShape& output_shape, + int16_t* output_data, CpuBackendContext* cpu_backend_context) { ruy::profiler::ScopeLabel label("FullyConnected/Uint8Int16"); - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); TFLITE_DCHECK_EQ(output_offset, 0); TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); @@ -402,26 +402,26 @@ inline void FullyConnected( output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); - cpu_backend_gemm::MatrixParams lhs_params; + cpu_backend_gemm::MatrixParams lhs_params; lhs_params.rows = output_depth; lhs_params.cols = accum_depth; lhs_params.order = cpu_backend_gemm::Order::kRowMajor; lhs_params.zero_point = -filter_offset; lhs_params.cache_policy = cpu_backend_gemm::DefaultCachePolicy(params.lhs_cacheable); - cpu_backend_gemm::MatrixParams rhs_params; + cpu_backend_gemm::MatrixParams rhs_params; rhs_params.rows = accum_depth; rhs_params.cols = batches; rhs_params.order = cpu_backend_gemm::Order::kColMajor; rhs_params.zero_point = -input_offset; rhs_params.cache_policy = cpu_backend_gemm::DefaultCachePolicy(params.rhs_cacheable); - cpu_backend_gemm::MatrixParams dst_params; + cpu_backend_gemm::MatrixParams dst_params; dst_params.rows = output_depth; dst_params.cols = batches; dst_params.order = cpu_backend_gemm::Order::kColMajor; dst_params.zero_point = 0; - cpu_backend_gemm::GemmParams gemm_params; + cpu_backend_gemm::GemmParams gemm_params; gemm_params.bias = bias_data_int32; gemm_params.clamp_min = output_activation_min; gemm_params.clamp_max = output_activation_max; @@ -438,12 +438,12 @@ inline void FullyConnected( // as the 'task' for worker threads to run (multi-threaded case, see // ShuffledFullyConnectedWorkerTask below). 
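The FullyConnected hunks above (and the Conv hunks further down) only migrate the MatrixParams/GemmParams declarations to explicit std::int*_t scalar types; the calling convention itself is unchanged. A minimal float sketch of that convention, assuming the templated cpu_backend_gemm::Gemm entry point these kernels already use; the wrapper function is illustrative only.

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"

// Multiplies a row-major LHS (rows x depth) by a column-major RHS (depth x
// cols) into a column-major DST, with no bias and no activation clamping.
void GemmFloat(const float* lhs, const float* rhs, float* dst, int rows,
               int depth, int cols, tflite::CpuBackendContext* context) {
  namespace gemm = tflite::cpu_backend_gemm;
  gemm::MatrixParams<float> lhs_params;
  lhs_params.order = gemm::Order::kRowMajor;
  lhs_params.rows = rows;
  lhs_params.cols = depth;
  gemm::MatrixParams<float> rhs_params;
  rhs_params.order = gemm::Order::kColMajor;
  rhs_params.rows = depth;
  rhs_params.cols = cols;
  gemm::MatrixParams<float> dst_params;
  dst_params.order = gemm::Order::kColMajor;
  dst_params.rows = rows;
  dst_params.cols = cols;
  gemm::GemmParams<float, float> gemm_params;  // Defaults: no bias, no clamp.
  gemm::Gemm(lhs_params, lhs, rhs_params, rhs, dst_params, dst, gemm_params,
             context);
}
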
inline void ShuffledFullyConnectedWorkerImpl( - const uint8* shuffled_input_workspace_data, - const int8* shuffled_weights_data, int batches, int output_depth, - int output_stride, int accum_depth, const int32* bias_data, - int32 output_multiplier, int output_shift, int16* output_data) { + const uint8_t* shuffled_input_workspace_data, + const int8_t* shuffled_weights_data, int batches, int output_depth, + int output_stride, int accum_depth, const int32_t* bias_data, + int32_t output_multiplier, int output_shift, int16_t* output_data) { #if defined USE_NEON - const int8* shuffled_weights_ptr = shuffled_weights_data; + const int8_t* shuffled_weights_ptr = shuffled_weights_data; if (batches == 1) { const int right_shift = output_shift > 0 ? 0 : -output_shift; const int left_shift = output_shift > 0 ? output_shift : 0; @@ -515,8 +515,8 @@ inline void ShuffledFullyConnectedWorkerImpl( const int right_shift = output_shift > 0 ? 0 : -output_shift; const int left_shift = output_shift > 0 ? output_shift : 0; for (int c = 0; c < output_depth; c += 4) { - const int8* shuffled_input_ptr = - reinterpret_cast(shuffled_input_workspace_data); + const int8_t* shuffled_input_ptr = + reinterpret_cast(shuffled_input_workspace_data); // Accumulation loop. int32x4_t row_accum00 = vdupq_n_s32(0); int32x4_t row_accum10 = vdupq_n_s32(0); @@ -613,26 +613,26 @@ inline void ShuffledFullyConnectedWorkerImpl( } #else if (batches == 1) { - int16* output_ptr = output_data; + int16_t* output_ptr = output_data; // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to + // so that just reinterpreting them as int8_t values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); + const int8_t* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); // Likewise, we preshuffled and pre-xored the input data above. - const int8* shuffled_input_data = - reinterpret_cast(shuffled_input_workspace_data); + const int8_t* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); for (int c = 0; c < output_depth; c += 4) { // Internal accumulation. // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; + int32_t accum[4] = {0}; // Accumulation loop. for (int d = 0; d < accum_depth; d += 16) { for (int i = 0; i < 4; i++) { for (int j = 0; j < 16; j++) { - int8 input_val = shuffled_input_data[d + j]; - int8 weights_val = *shuffled_weights_ptr++; + int8_t input_val = shuffled_input_data[d + j]; + int8_t weights_val = *shuffled_weights_ptr++; accum[i] += weights_val * input_val; } } @@ -640,35 +640,35 @@ inline void ShuffledFullyConnectedWorkerImpl( for (int i = 0; i < 4; i++) { // Add bias value int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our + // Down-scale the final int32_t accumulator to the scale used by our // (16-bit, typically 3 integer bits) fixed-point format. The quantized // multiplier and shift here have been pre-computed offline // (e.g. by toco). acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - // Saturate, cast to int16, and store to output array. + // Saturate, cast to int16_t, and store to output array. 
acc = std::max(acc, -32768); acc = std::min(acc, 32767); output_ptr[c + i] = acc; } } } else if (batches == 4) { - int16* output_ptr = output_data; + int16_t* output_ptr = output_data; // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to + // so that just reinterpreting them as int8_t values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); + const int8_t* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); // Likewise, we preshuffled and pre-xored the input data above. - const int8* shuffled_input_data = - reinterpret_cast(shuffled_input_workspace_data); + const int8_t* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); for (int c = 0; c < output_depth; c += 4) { - const int8* shuffled_input_ptr = shuffled_input_data; + const int8_t* shuffled_input_ptr = shuffled_input_data; // Accumulation loop. // Internal accumulation. // Initialize accumulator with the bias-value. - int32 accum[4][4]; + int32_t accum[4][4]; for (int i = 0; i < 4; i++) { for (int b = 0; b < 4; b++) { accum[i][b] = 0; @@ -678,8 +678,8 @@ inline void ShuffledFullyConnectedWorkerImpl( for (int i = 0; i < 4; i++) { for (int b = 0; b < 4; b++) { for (int j = 0; j < 16; j++) { - int8 input_val = shuffled_input_ptr[16 * b + j]; - int8 weights_val = shuffled_weights_ptr[16 * i + j]; + int8_t input_val = shuffled_input_ptr[16 * b + j]; + int8_t weights_val = shuffled_weights_ptr[16 * i + j]; accum[i][b] += weights_val * input_val; } } @@ -691,13 +691,13 @@ inline void ShuffledFullyConnectedWorkerImpl( for (int b = 0; b < 4; b++) { // Add bias value int acc = accum[i][b] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our + // Down-scale the final int32_t accumulator to the scale used by our // (16-bit, typically 3 integer bits) fixed-point format. The // quantized multiplier and shift here have been pre-computed offline // (e.g. by toco). acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - // Saturate, cast to int16, and store to output array. + // Saturate, cast to int16_t, and store to output array. acc = std::max(acc, -32768); acc = std::min(acc, 32767); output_ptr[b * output_stride + c + i] = acc; @@ -714,13 +714,13 @@ inline void ShuffledFullyConnectedWorkerImpl( // Wraps ShuffledFullyConnectedWorkerImpl into a Task class // to allow using gemmlowp's threadpool. 
struct ShuffledFullyConnectedWorkerTask : cpu_backend_threadpool::Task { - ShuffledFullyConnectedWorkerTask(const uint8* input_data, - const int8* shuffled_weights_data, + ShuffledFullyConnectedWorkerTask(const uint8_t* input_data, + const int8_t* shuffled_weights_data, int batches, int output_depth, int output_stride, int accum_depth, - const int32* bias_data, - int32 output_multiplier, int output_shift, - int16* output_data) + const int32_t* bias_data, + int32_t output_multiplier, int output_shift, + int16_t* output_data) : input_data_(input_data), shuffled_weights_data_(shuffled_weights_data), batches_(batches), @@ -739,30 +739,30 @@ struct ShuffledFullyConnectedWorkerTask : cpu_backend_threadpool::Task { output_shift_, output_data_); } - const uint8* input_data_; - const int8* shuffled_weights_data_; + const uint8_t* input_data_; + const int8_t* shuffled_weights_data_; int batches_; int output_depth_; int output_stride_; int accum_depth_; - const int32* bias_data_; - int32 output_multiplier_; + const int32_t* bias_data_; + int32_t output_multiplier_; int output_shift_; - int16* output_data_; + int16_t* output_data_; }; inline void ShuffledFullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& weights_shape, - const uint8* shuffled_weights_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - int16* output_data, uint8* shuffled_input_workspace_data, + const uint8_t* input_data, const RuntimeShape& weights_shape, + const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int16_t* output_data, uint8_t* shuffled_input_workspace_data, CpuBackendContext* cpu_backend_context) { ruy::profiler::ScopeLabel label("ShuffledFullyConnected/8bit"); - const int32 output_multiplier = params.output_multiplier; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_EQ(output_activation_min, -32768); TFLITE_DCHECK_EQ(output_activation_max, 32767); TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); @@ -782,11 +782,11 @@ inline void ShuffledFullyConnected( TFLITE_DCHECK((accum_depth % 16) == 0); TFLITE_DCHECK((output_depth % 4) == 0); // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to + // so that just reinterpreting them as int8_t values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. 
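The sign-bit remark in the comment above can be checked directly: XOR-ing a uint8_t with 0x80 and reinterpreting the bits as int8_t yields exactly value - 128, so the zero_point subtraction really does come for free. A standalone demonstration, illustrative and not part of the kernel:

#include <cstdint>
#include <cstdio>

int main() {
  for (int v = 0; v < 256; ++v) {
    const uint8_t flipped = static_cast<uint8_t>(v) ^ 0x80;
    const int8_t reinterpreted = static_cast<int8_t>(flipped);
    if (reinterpreted != v - 128) {
      std::printf("mismatch at %d\n", v);
      return 1;
    }
  }
  std::printf("uint8 ^ 0x80, read as int8, equals value - 128 for all 256 values\n");
  return 0;
}
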
- const int8* int8_shuffled_weights_data = - reinterpret_cast(shuffled_weights_data); + const int8_t* int8_shuffled_weights_data = + reinterpret_cast(shuffled_weights_data); // Shuffling and xoring of input activations into the workspace buffer if (batches == 1) { @@ -803,12 +803,12 @@ inline void ShuffledFullyConnected( } #endif } else if (batches == 4) { - uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data; int c = 0; #ifdef USE_NEON const uint8x16_t signbit = vdupq_n_u8(0x80); for (c = 0; c < accum_depth; c += 16) { - const uint8* src_data_ptr = input_data + c; + const uint8_t* src_data_ptr = input_data + c; uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth); uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth); uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth); @@ -826,13 +826,13 @@ inline void ShuffledFullyConnected( #else for (c = 0; c < accum_depth; c += 16) { for (int b = 0; b < 4; b++) { - const uint8* src_data_ptr = input_data + b * accum_depth + c; + const uint8_t* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - uint8 src_val = *src_data_ptr++; + uint8_t src_val = *src_data_ptr++; // Flip the sign bit, so that the kernel will only need to - // reinterpret these uint8 values as int8, getting for free the + // reinterpret these uint8_t values as int8_t, getting for free the // subtraction of the zero_point value 128. - uint8 dst_val = src_val ^ 0x80; + uint8_t dst_val = src_val ^ 0x80; *shuffled_input_workspace_ptr++ = dst_val; } } @@ -930,7 +930,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, ruy::profiler::ScopeLabel label("Conv"); // NB: the float 0.0f value is represented by all zero bytes. 
- const uint8 float_zero_byte = 0x00; + const uint8_t float_zero_byte = 0x00; const float* gemm_input_data = nullptr; const RuntimeShape* gemm_input_shape = nullptr; const int filter_width = filter_shape.Dims(2); @@ -1117,7 +1117,7 @@ inline void HybridConvPerChannel( TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int8* gemm_input_data = nullptr; + const int8_t* gemm_input_data = nullptr; const RuntimeShape* gemm_input_shape = nullptr; const int filter_width = filter_shape.Dims(2); const int filter_height = filter_shape.Dims(1); @@ -1168,17 +1168,17 @@ inline void HybridConvPerChannel( } } - cpu_backend_gemm::MatrixParams lhs_params; + cpu_backend_gemm::MatrixParams lhs_params; lhs_params.rows = filter_rows; lhs_params.cols = filter_cols; lhs_params.order = cpu_backend_gemm::Order::kRowMajor; - cpu_backend_gemm::MatrixParams rhs_params; + cpu_backend_gemm::MatrixParams rhs_params; rhs_params.order = cpu_backend_gemm::Order::kColMajor; rhs_params.rows = gemm_input_rows; rhs_params.cols = gemm_input_cols; - cpu_backend_gemm::MatrixParams dst_params; + cpu_backend_gemm::MatrixParams dst_params; dst_params.order = cpu_backend_gemm::Order::kColMajor; dst_params.rows = output_rows; dst_params.cols = output_cols; @@ -1210,29 +1210,29 @@ inline void HybridConvPerChannel( } inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - uint8* output_data, const RuntimeShape& im2col_shape, - uint8* im2col_data, CpuBackendContext* cpu_backend_context) { + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + uint8_t* output_data, const RuntimeShape& im2col_shape, + uint8_t* im2col_data, CpuBackendContext* cpu_backend_context) { ruy::profiler::ScopeLabel label("Conv/8bit"); const int stride_width = params.stride_width; const int stride_height = params.stride_height; const int dilation_width_factor = params.dilation_width_factor; const int dilation_height_factor = params.dilation_height_factor; - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const uint8* gemm_input_data = nullptr; + const uint8_t* gemm_input_data = nullptr; const RuntimeShape* gemm_input_shape = nullptr; const int filter_width = filter_shape.Dims(2); const int filter_height = filter_shape.Dims(1); @@ -1287,22 +1287,22 @@ inline void Conv(const ConvParams& params, const 
RuntimeShape& input_shape, TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows); TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows); - cpu_backend_gemm::MatrixParams lhs_params; + cpu_backend_gemm::MatrixParams lhs_params; lhs_params.rows = filter_rows; lhs_params.cols = filter_cols; lhs_params.order = cpu_backend_gemm::Order::kRowMajor; lhs_params.zero_point = -filter_offset; - cpu_backend_gemm::MatrixParams rhs_params; + cpu_backend_gemm::MatrixParams rhs_params; rhs_params.rows = gemm_input_rows; rhs_params.cols = gemm_input_cols; rhs_params.order = cpu_backend_gemm::Order::kColMajor; rhs_params.zero_point = -input_offset; - cpu_backend_gemm::MatrixParams dst_params; + cpu_backend_gemm::MatrixParams dst_params; dst_params.rows = output_rows; dst_params.cols = output_cols; dst_params.order = cpu_backend_gemm::Order::kColMajor; dst_params.zero_point = output_offset; - cpu_backend_gemm::GemmParams gemm_params; + cpu_backend_gemm::GemmParams gemm_params; gemm_params.bias = bias_data; gemm_params.clamp_min = output_activation_min; gemm_params.clamp_max = output_activation_max; @@ -1433,37 +1433,37 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params, inline void L2Normalization(const tflite::L2NormalizationParams& op_params, const RuntimeShape& input_shape, - const uint8* input_data, + const uint8_t* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { ruy::profiler::ScopeLabel label("L2Normalization/8bit"); const int trailing_dim = input_shape.DimensionsCount() - 1; const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int32 input_zero_point = op_params.input_zero_point; + const int32_t input_zero_point = op_params.input_zero_point; for (int i = 0; i < outer_size; ++i) { - int32 square_l2_norm = 0; + int32_t square_l2_norm = 0; for (int c = 0; c < depth; c++) { // Note that input_data advances by depth in the second pass below. - int32 diff = input_data[c] - input_zero_point; + int32_t diff = input_data[c] - input_zero_point; square_l2_norm += diff * diff; } // TODO(b/29395854): add clamping to TOCO and TF Lite kernel // for all zero tensors in the input_data - int32 inv_l2norm_multiplier; + int32_t inv_l2norm_multiplier; int inv_l2norm_shift; GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift, &inv_l2norm_multiplier, &inv_l2norm_shift); for (int c = 0; c < depth; c++) { - int32 diff = *input_data - input_zero_point; - int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + int32_t diff = *input_data - input_zero_point; + int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); - int32 unclamped_output_val = 128 + rescaled_diff; - int32 output_val = std::min(255, std::max(0, unclamped_output_val)); - *output_data = static_cast(output_val); + int32_t unclamped_output_val = 128 + rescaled_diff; + int32_t output_val = std::min(255, std::max(0, unclamped_output_val)); + *output_data = static_cast(output_val); ++input_data; ++output_data; } @@ -1534,8 +1534,8 @@ inline void Add(const ArithmeticParams& params, // Element-wise add that can often be used for inner loop of broadcast add as // well as the non-broadcast add. 
inline void AddElementwise(int size, const ArithmeticParams& params, - const uint8* input1_data, const uint8* input2_data, - uint8* output_data) { + const uint8_t* input1_data, + const uint8_t* input2_data, uint8_t* output_data) { ruy::profiler::ScopeLabel label("AddElementwise/8bit"); int i = 0; TFLITE_DCHECK_GT(params.input1_offset, -256); @@ -1600,25 +1600,25 @@ inline void AddElementwise(int size, const ArithmeticParams& params, #endif // NEON for (; i < size; ++i) { - const int32 input1_val = params.input1_offset + input1_data[i]; - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 shifted_input1_val = input1_val * (1 << params.left_shift); - const int32 shifted_input2_val = input2_val * (1 << params.left_shift); - const int32 scaled_input1_val = + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, params.input1_multiplier, params.input1_shift); - const int32 scaled_input2_val = + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, params.input2_multiplier, params.input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } @@ -1626,8 +1626,8 @@ inline void AddElementwise(int size, const ArithmeticParams& params, // broadcast add, so that, for example, scalar-broadcast with batch will still // be fast. inline void AddScalarBroadcast(int size, const ArithmeticParams& params, - uint8 input1_data, const uint8* input2_data, - uint8* output_data) { + uint8_t input1_data, const uint8_t* input2_data, + uint8_t* output_data) { using gemmlowp::RoundingDivideByPOT; ruy::profiler::ScopeLabel label("AddScalarBroadcast/8bit"); @@ -1699,28 +1699,28 @@ inline void AddScalarBroadcast(int size, const ArithmeticParams& params, if (i < size) { // Process broadcast scalar. 
- const int32 input1_val = params.input1_offset + input1_data; - const int32 shifted_input1_val = input1_val * (1 << params.left_shift); - const int32 scaled_input1_val = + const int32_t input1_val = params.input1_offset + input1_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, params.input1_multiplier, params.input1_shift); for (; i < size; ++i) { - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 shifted_input2_val = input2_val * (1 << params.left_shift); - const int32 scaled_input2_val = + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, params.input2_multiplier, params.input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } } @@ -1759,9 +1759,9 @@ inline void AddScalarBroadcast(int size, const ArithmeticParams& params, } inline void Add(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const uint8* input1_data, - const RuntimeShape& input2_shape, const uint8* input2_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input1_shape, const uint8_t* input1_data, + const RuntimeShape& input2_shape, const uint8_t* input2_data, + const RuntimeShape& output_shape, uint8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); ruy::profiler::ScopeLabel label("Add/8bit"); @@ -1776,9 +1776,9 @@ inline void Add(const ArithmeticParams& params, } inline void Add(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, - const RuntimeShape& output_shape, int16* output_data) { + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, int16_t* output_data) { ruy::profiler::ScopeLabel label("Add/Int16"); TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); @@ -1786,14 +1786,15 @@ inline void Add(const ArithmeticParams& params, const int input1_shift = params.input1_shift; const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - const int16 output_activation_min = params.quantized_activation_min; - const int16 output_activation_max = params.quantized_activation_max; + const int16_t output_activation_min = params.quantized_activation_min; + const int16_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); TFLITE_DCHECK_LE(input1_shift, 0); TFLITE_DCHECK_LE(params.input2_shift, 0); - const int16* not_shift_input = input1_shift == 0 ? 
input1_data : input2_data; - const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; + const int16_t* not_shift_input = + input1_shift == 0 ? input1_data : input2_data; + const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data; const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift; @@ -1805,8 +1806,8 @@ inline void Add(const ArithmeticParams& params, F0 scaled_input = F0::FromRaw( gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled); - const int16 raw_output = result.raw(); - const int16 clamped_output = std::min( + const int16_t raw_output = result.raw(); + const int16_t clamped_output = std::min( output_activation_max, std::max(output_activation_min, raw_output)); output_data[i] = clamped_output; } @@ -1867,11 +1868,11 @@ inline void BroadcastAddDispatch( inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, + const uint8_t* unswitched_input1_data, const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, + const uint8_t* unswitched_input2_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { BroadcastAddDispatch(unswitched_params, unswitched_input1_shape, unswitched_input1_data, unswitched_input2_shape, unswitched_input2_data, output_shape, output_data); @@ -1946,6 +1947,63 @@ inline void MulElementwise(int size, const ArithmeticParams& params, } } +inline void MulElementwise(int32_t n, const ArithmeticParams& params, + const int32_t* __restrict lhs, + const int32_t* __restrict rhs, + int32_t* __restrict out) { + const int32_t activation_min_val = params.quantized_activation_min; + const int32_t activation_max_val = params.quantized_activation_max; + + int32_t i = 0; + +#ifdef USE_NEON + const int32x4_t activation_min = vdupq_n_s32(activation_min_val); + const int32x4_t activation_max = vdupq_n_s32(activation_max_val); + + // Ewise Mul 16 elements at a time using 4 4-wide vector registers per loop. + for (; i <= n - 16; i += 16) { + // Load. + const int32x4_t lhs_reg = vld1q_s32(lhs + i); + const int32x4_t lhs_reg2 = vld1q_s32(lhs + i + 4); + const int32x4_t lhs_reg3 = vld1q_s32(lhs + i + 8); + const int32x4_t lhs_reg4 = vld1q_s32(lhs + i + 12); + + const int32x4_t rhs_reg = vld1q_s32(rhs + i); + const int32x4_t rhs_reg2 = vld1q_s32(rhs + i + 4); + const int32x4_t rhs_reg3 = vld1q_s32(rhs + i + 8); + const int32x4_t rhs_reg4 = vld1q_s32(rhs + i + 12); + + // Multiply. + const int32x4_t mul_reg = vmulq_s32(lhs_reg, rhs_reg); + const int32x4_t mul_reg2 = vmulq_s32(lhs_reg2, rhs_reg2); + const int32x4_t mul_reg3 = vmulq_s32(lhs_reg3, rhs_reg3); + const int32x4_t mul_reg4 = vmulq_s32(lhs_reg4, rhs_reg4); + + // Apply activation. + const int32x4_t max_reg = vminq_s32(activation_max, mul_reg); + const int32x4_t max_reg2 = vminq_s32(activation_max, mul_reg2); + const int32x4_t max_reg3 = vminq_s32(activation_max, mul_reg3); + const int32x4_t max_reg4 = vminq_s32(activation_max, mul_reg4); + const int32x4_t min_reg = vmaxq_s32(activation_min, max_reg); + const int32x4_t min_reg2 = vmaxq_s32(activation_min, max_reg2); + const int32x4_t min_reg3 = vmaxq_s32(activation_min, max_reg3); + const int32x4_t min_reg4 = vmaxq_s32(activation_min, max_reg4); + + // Store. 
+ vst1q_s32(out + i, min_reg); + vst1q_s32(out + i + 4, min_reg2); + vst1q_s32(out + i + 8, min_reg3); + vst1q_s32(out + i + 12, min_reg4); + } +#endif + + // This will handle leftovers when n is not aligned to 4 elements. + for (; i < n; ++i) { + out[i] = ActivationFunctionWithMinMax(lhs[i] * rhs[i], activation_min_val, + activation_max_val); + } +} + inline void Mul(const ArithmeticParams& params, const RuntimeShape& input1_shape, const float* input1_data, const RuntimeShape& input2_shape, const float* input2_data, @@ -1958,30 +2016,25 @@ inline void Mul(const ArithmeticParams& params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int32* input1_data, - const RuntimeShape& input2_shape, const int32* input2_data, - const RuntimeShape& output_shape, int32* output_data) { - ruy::profiler::ScopeLabel label("Mul/int32/activation"); + const RuntimeShape& input1_shape, const int32_t* input1_data, + const RuntimeShape& input2_shape, const int32_t* input2_data, + const RuntimeShape& output_shape, int32_t* output_data) { + ruy::profiler::ScopeLabel label("Mul/int32_t/activation"); const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; - for (int i = 0; i < flat_size; ++i) { - output_data[i] = ActivationFunctionWithMinMax( - input1_data[i] * input2_data[i], output_activation_min, - output_activation_max); - } + + MulElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void MulNoActivation(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int32* input1_data, + const int32_t* input1_data, const RuntimeShape& input2_shape, - const int32* input2_data, + const int32_t* input2_data, const RuntimeShape& output_shape, - int32* output_data) { - ruy::profiler::ScopeLabel label("Mul/int32"); + int32_t* output_data) { + ruy::profiler::ScopeLabel label("Mul/int32_t"); auto input1_map = MapAsVector(input1_data, input1_shape); auto input2_map = MapAsVector(input2_data, input2_shape); @@ -2002,9 +2055,9 @@ inline void MulNoActivation(const ArithmeticParams& params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, - const RuntimeShape& output_shape, int16* output_data) { + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, int16_t* output_data) { ruy::profiler::ScopeLabel label("Mul/Int16/NoActivation"); // This is a copy of the reference implementation. We do not currently have a // properly optimized version. @@ -2023,15 +2076,15 @@ inline void Mul(const ArithmeticParams& params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, uint8_t* output_data) { ruy::profiler::ScopeLabel label("Mul/Int16Uint8"); // This is a copy of the reference implementation. We do not currently have a // properly optimized version. 
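+// [Illustrative sketch] The int32_t MulElementwise added above computes, for
+// every element, clamp(lhs[i] * rhs[i], quantized_activation_min,
+// quantized_activation_max); the NEON block handles 16 lanes per iteration and
+// the trailing scalar loop covers the remainder. A self-contained scalar
+// reference of that per-element computation (handy for checking the vector
+// path; the function name here is made up for illustration) could look like:
+//
+//   #include <algorithm>
+//   #include <cstdint>
+//
+//   inline void ClampedMulInt32Reference(int n, int32_t act_min,
+//                                        int32_t act_max, const int32_t* lhs,
+//                                        const int32_t* rhs, int32_t* out) {
+//     for (int i = 0; i < n; ++i) {
+//       out[i] = std::min(act_max, std::max(act_min, lhs[i] * rhs[i]));
+//     }
+//   }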
- const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; - const int32 output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t output_offset = params.output_offset; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int flat_size = @@ -2043,12 +2096,12 @@ inline void Mul(const ArithmeticParams& params, F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]); - int16 rescaled_result = + int16_t rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8); - int16 clamped_result = - std::min(output_activation_max - output_offset, rescaled_result); - clamped_result = - std::max(output_activation_min - output_offset, clamped_result); + int16_t clamped_result = std::min( + output_activation_max - output_offset, rescaled_result); + clamped_result = std::max(output_activation_min - output_offset, + clamped_result); output_data[i] = output_offset + clamped_result; } } @@ -2056,8 +2109,8 @@ inline void Mul(const ArithmeticParams& params, // Element-wise mul that can often be used for inner loop of broadcast Mul as // well as the non-broadcast Mul. inline void MulElementwise(int size, const ArithmeticParams& params, - const uint8* input1_data, const uint8* input2_data, - uint8* output_data) { + const uint8_t* input1_data, + const uint8_t* input2_data, uint8_t* output_data) { int i = 0; TFLITE_DCHECK_GT(params.input1_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); @@ -2115,25 +2168,26 @@ inline void MulElementwise(int size, const ArithmeticParams& params, #endif // NEON for (; i < size; ++i) { - const int32 input1_val = params.input1_offset + input1_data[i]; - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 unclamped_result = + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, params.output_multiplier, params.output_shift); - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } // Broadcast mul that can often be used for inner loop of broadcast Mul. 
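+// [Illustrative sketch] The scalar tail of the uint8_t MulElementwise above
+// requantizes each product with MultiplyByQuantizedMultiplier and then adds
+// output_offset. Assuming the usual TFLite convention that output_multiplier
+// is a Q31 significand paired with a power-of-two output_shift, a rough
+// floating-point reference of that step (not the fixed-point implementation
+// itself) is:
+//
+//   #include <algorithm>
+//   #include <cmath>
+//   #include <cstdint>
+//
+//   inline uint8_t RequantizeProductApprox(int32_t in1_centered,
+//                                          int32_t in2_centered,
+//                                          int32_t output_multiplier,
+//                                          int output_shift,
+//                                          int32_t output_offset,
+//                                          int32_t qmin, int32_t qmax) {
+//     // real_scale ~= output_multiplier * 2^(output_shift - 31)
+//     const double real_scale =
+//         output_multiplier * std::ldexp(1.0, output_shift - 31);
+//     const int32_t unclamped =
+//         output_offset + static_cast<int32_t>(std::lround(
+//                             in1_centered * in2_centered * real_scale));
+//     return static_cast<uint8_t>(std::min(qmax, std::max(qmin, unclamped)));
+//   }
+//
+// The fixed-point code above computes the same value without floats, up to
+// rounding differences.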
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, - const uint8 broadcast_value, - const uint8* input2_data, uint8* output_data) { - const int16 input1_val = params.input1_offset + broadcast_value; + const uint8_t broadcast_value, + const uint8_t* input2_data, + uint8_t* output_data) { + const int16_t input1_val = params.input1_offset + broadcast_value; int i = 0; TFLITE_DCHECK_GT(params.input1_offset, -256); @@ -2185,16 +2239,16 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, #endif // NEON for (; i < size; ++i) { - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 unclamped_result = + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, params.output_multiplier, params.output_shift); - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } @@ -2232,9 +2286,9 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const uint8* input1_data, - const RuntimeShape& input2_shape, const uint8* input2_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input1_shape, const uint8_t* input1_data, + const RuntimeShape& input2_shape, const uint8_t* input2_data, + const RuntimeShape& output_shape, uint8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); ruy::profiler::ScopeLabel label("Mul/8bit"); @@ -2265,11 +2319,11 @@ inline void BroadcastMulDispatch( inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, + const uint8_t* unswitched_input1_data, const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, + const uint8_t* unswitched_input2_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { BroadcastMulDispatch(unswitched_params, unswitched_input1_shape, unswitched_input1_data, unswitched_input2_shape, unswitched_input2_data, output_shape, output_data); @@ -2347,11 +2401,11 @@ void BroadcastDivSlow(const ArithmeticParams& params, template inline void BroadcastDivSlow(const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape, - const uint8* input1_data, + const uint8_t* input1_data, const RuntimeShape& unextended_input2_shape, - const uint8* input2_data, + const uint8_t* input2_data, const RuntimeShape& unextended_output_shape, - uint8* output_data) { + uint8_t* output_data) { TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N); TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N); TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N); @@ -2372,9 +2426,9 @@ inline void BroadcastDivSlow(const ArithmeticParams& params, TFLITE_DCHECK_LT(params.output_offset, 256); auto div_func = [&](int indexes[N]) { - int32 input1_val = + int32_t input1_val = params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)]; - int32 input2_val = + int32_t input2_val = params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)]; TFLITE_DCHECK_NE(input2_val, 0); if (input2_val < 0) { @@ -2384,20 
+2438,21 @@ inline void BroadcastDivSlow(const ArithmeticParams& params, input2_val = -input2_val; } int recip_shift; - const int32 input2_inv = GetReciprocal(input2_val, 31, &recip_shift); + const int32_t input2_inv = GetReciprocal(input2_val, 31, &recip_shift); const int headroom = CountLeadingSignBits(input1_val); - const int32 unscaled_quotient = MultiplyByQuantizedMultiplierGreaterThanOne( - input1_val, input2_inv, headroom); + const int32_t unscaled_quotient = + MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv, + headroom); const int total_shift = params.output_shift - recip_shift - headroom; - const int32 unclamped_result = + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp( unscaled_quotient, params.output_multiplier, total_shift); - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[SubscriptToIndex(output_desc, indexes)] = - static_cast(clamped_output); + static_cast(clamped_output); }; NDOpsHelper(output_desc, div_func); } @@ -2578,25 +2633,25 @@ inline void LstmCell( template inline void LstmCell( const LstmCellParams& params, const RuntimeShape& unextended_input_shape, - const uint8* input_data_uint8, + const uint8_t* input_data_uint8, const RuntimeShape& unextended_prev_activ_shape, - const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape, - const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape, - const int32* bias_data_int32, + const uint8_t* prev_activ_data_uint8, const RuntimeShape& weights_shape, + const uint8_t* weights_data_uint8, + const RuntimeShape& unextended_bias_shape, const int32_t* bias_data_int32, const RuntimeShape& unextended_prev_state_shape, - const int16* prev_state_data_int16, + const int16_t* prev_state_data_int16, const RuntimeShape& unextended_output_state_shape, - int16* output_state_data_int16, + int16_t* output_state_data_int16, const RuntimeShape& unextended_output_activ_shape, - uint8* output_activ_data_uint8, + uint8_t* output_activ_data_uint8, const RuntimeShape& unextended_concat_temp_shape, - uint8* concat_temp_data_uint8, + uint8_t* concat_temp_data_uint8, const RuntimeShape& unextended_activ_temp_shape, - int16* activ_temp_data_int16, CpuBackendContext* cpu_backend_context) { + int16_t* activ_temp_data_int16, CpuBackendContext* cpu_backend_context) { ruy::profiler::ScopeLabel label( "LstmCell/quantized (8bit external, 16bit internal)"); - int32 weights_zero_point = params.weights_zero_point; - int32 accum_multiplier = params.accum_multiplier; + int32_t weights_zero_point = params.weights_zero_point; + int32_t accum_multiplier = params.accum_multiplier; int accum_shift = params.accum_shift; TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4); @@ -2651,8 +2706,8 @@ inline void LstmCell( TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth); // Depth-concatenate prev_activ and input data together. 
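+// [Illustrative sketch] The quantized div_func further up replaces the
+// per-element division by input2_val with a multiplication by a precomputed
+// reciprocal (GetReciprocal) plus shift bookkeeping. Stripped of the headroom
+// handling and output rescaling, the underlying trick looks like this
+// self-contained helper (it assumes d > 0, as the code above flips signs
+// first, and the result may differ from exact division by one unit):
+//
+//   #include <cstdint>
+//
+//   inline int32_t DivByReciprocalApprox(int32_t n, int32_t d) {
+//     const int64_t recip =
+//         ((int64_t{1} << 31) + d / 2) / d;  // ~ round(2^31 / d)
+//     return static_cast<int32_t>((static_cast<int64_t>(n) * recip) >> 31);
+//   }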
- uint8 const* concat_input_arrays_data[2] = {input_data_uint8, - prev_activ_data_uint8}; + uint8_t const* concat_input_arrays_data[2] = {input_data_uint8, + prev_activ_data_uint8}; const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape}; tflite::ConcatenationParams concat_params; @@ -2667,22 +2722,22 @@ inline void LstmCell( // integers, and the output is 16-bit fixed-point with 3 integer bits so // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that // is explained in the function comment above. - cpu_backend_gemm::MatrixParams lhs_params; + cpu_backend_gemm::MatrixParams lhs_params; lhs_params.rows = fc_output_depth; lhs_params.cols = fc_accum_depth; lhs_params.order = cpu_backend_gemm::Order::kRowMajor; lhs_params.zero_point = weights_zero_point; - cpu_backend_gemm::MatrixParams rhs_params; + cpu_backend_gemm::MatrixParams rhs_params; rhs_params.rows = fc_accum_depth; rhs_params.cols = fc_batches; rhs_params.order = cpu_backend_gemm::Order::kColMajor; rhs_params.zero_point = 128; - cpu_backend_gemm::MatrixParams dst_params; + cpu_backend_gemm::MatrixParams dst_params; dst_params.rows = fc_output_depth; dst_params.cols = fc_batches; dst_params.order = cpu_backend_gemm::Order::kColMajor; dst_params.zero_point = 0; - cpu_backend_gemm::GemmParams gemm_params; + cpu_backend_gemm::GemmParams gemm_params; gemm_params.bias = bias_data_int32; gemm_params.multiplier_fixedpoint = accum_multiplier; gemm_params.multiplier_exponent = accum_shift; @@ -2692,21 +2747,23 @@ inline void LstmCell( // Rest of the LSTM cell: tanh and logistic math functions, and some adds // and muls, all done in 16-bit fixed-point. - const int16* input_gate_input_ptr = activ_temp_data_int16; - const int16* input_modulation_gate_input_ptr = + const int16_t* input_gate_input_ptr = activ_temp_data_int16; + const int16_t* input_modulation_gate_input_ptr = activ_temp_data_int16 + output_depth; - const int16* forget_gate_input_ptr = activ_temp_data_int16 + 2 * output_depth; - const int16* output_gate_input_ptr = activ_temp_data_int16 + 3 * output_depth; - const int16* prev_state_ptr = prev_state_data_int16; - int16* output_state_data_ptr = output_state_data_int16; - uint8* output_activ_data_ptr = output_activ_data_uint8; + const int16_t* forget_gate_input_ptr = + activ_temp_data_int16 + 2 * output_depth; + const int16_t* output_gate_input_ptr = + activ_temp_data_int16 + 3 * output_depth; + const int16_t* prev_state_ptr = prev_state_data_int16; + int16_t* output_state_data_ptr = output_state_data_int16; + uint8_t* output_activ_data_ptr = output_activ_data_uint8; for (int b = 0; b < outer_size; ++b) { int c = 0; #ifdef GEMMLOWP_NEON for (; c <= output_depth - 8; c += 8) { // Define the fixed-point data types that we will use here. All use - // int16 as the underlying integer type i.e. all are 16-bit fixed-point. + // int16_t as the underlying integer type i.e. all are 16-bit fixed-point. // They only differ by the number of integral vs. fractional bits, // determining the range of values that they can represent. // @@ -2780,7 +2837,7 @@ inline void LstmCell( #endif for (; c < output_depth; ++c) { // Define the fixed-point data types that we will use here. All use - // int16 as the underlying integer type i.e. all are 16-bit fixed-point. + // int16_t as the underlying integer type i.e. all are 16-bit fixed-point. // They only differ by the number of integral vs. fractional bits, // determining the range of values that they can represent. 
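+// [Illustrative note] The "16-bit fixed-point with 3 integer bits" format used
+// for the gate pre-activations above maps a raw int16_t value r to the real
+// number r * 2^-12 (1 sign bit, 3 integer bits, 12 fractional bits), so the
+// representable range is [-8, +8) and, for example, raw 4096 stands for 1.0.
+// A tiny pair of conversion helpers, just to make the encoding concrete (the
+// real code uses gemmlowp::FixedPoint, which also handles saturation):
+//
+//   #include <cstdint>
+//
+//   inline float FixedPoint3IntBitsToFloat(int16_t raw) {
+//     return raw / 4096.0f;
+//   }
+//   inline int16_t FloatToFixedPoint3IntBits(float x) {
+//     return static_cast<int16_t>(x * 4096.0f);  // no saturation here
+//   }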
// @@ -2837,10 +2894,10 @@ inline void LstmCell( *output_state_data_ptr++ = new_state.raw(); // Down-scale the output activations to 8-bit integers, saturating, // and store back to memory. - int16 rescaled_output_activ = + int16_t rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8); - int16 clamped_output_activ = - std::max(-128, std::min(127, rescaled_output_activ)); + int16_t clamped_output_activ = std::max( + -128, std::min(127, rescaled_output_activ)); *output_activ_data_ptr++ = 128 + clamped_output_activ; } input_gate_input_ptr += 3 * output_depth; @@ -2923,8 +2980,9 @@ inline bool AveragePool(const PoolParams& params, inline bool AveragePool(const PoolParams& params, const RuntimeShape& input_shape, - const uint8* input_data, - const RuntimeShape& output_shape, uint8* output_data) { + const uint8_t* input_data, + const RuntimeShape& output_shape, + uint8_t* output_data) { ruy::profiler::ScopeLabel label("AveragePool/8bit"); // Here, and in other pooling ops, in order to maintain locality of reference, @@ -2947,7 +3005,7 @@ inline bool AveragePool(const PoolParams& params, const int stride_height = params.stride_height; const int stride_width = params.stride_width; - uint32 acc[kPoolingAccTrancheSize]; + uint32_t acc[kPoolingAccTrancheSize]; for (int batch = 0; batch < batches; ++batch) { // We proceed through the depth in tranches (see comment above). The // depth_base is the depth at the beginning of the tranche. The @@ -2972,15 +3030,15 @@ inline bool AveragePool(const PoolParams& params, (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); if (filter_count == 0) return false; memset(acc, 0, tranche_depth * sizeof(acc[0])); - const uint8* input_ptr = + const uint8_t* input_ptr = input_data + depth_base + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { - const uint8* input_row_ptr = + const uint8_t* input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); for (int fx = filter_x_start; fx < filter_x_end; fx++) { - const uint8* input_channel_ptr = input_row_ptr; + const uint8_t* input_channel_ptr = input_row_ptr; int channel = 0; #ifdef USE_NEON for (; channel <= tranche_depth - 16; channel += 16) { @@ -3016,14 +3074,14 @@ inline bool AveragePool(const PoolParams& params, input_row_ptr += depth; } } - uint8* output_ptr = output_data + Offset(output_shape, batch, out_y, - out_x, depth_base); + uint8_t* output_ptr = output_data + Offset(output_shape, batch, out_y, + out_x, depth_base); int channel = 0; #ifdef USE_NEON #define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ if (filter_count == FILTER_COUNT) { \ for (; channel <= tranche_depth - 8; channel += 8) { \ - uint16 buf[8]; \ + uint16_t buf[8]; \ for (int i = 0; i < 8; i++) { \ buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ } \ @@ -3037,7 +3095,7 @@ inline bool AveragePool(const PoolParams& params, AVGPOOL_DIVIDING_BY(15) #undef AVGPOOL_DIVIDING_BY for (; channel <= tranche_depth - 8; channel += 8) { - uint16 buf[8]; + uint16_t buf[8]; for (int i = 0; i < 8; i++) { buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; } @@ -3048,10 +3106,10 @@ inline bool AveragePool(const PoolParams& params, } #endif for (; channel < tranche_depth; ++channel) { - uint16 a = (acc[channel] + filter_count / 2) / filter_count; - a = std::max(a, params.quantized_activation_min); - a = std::min(a, params.quantized_activation_max); - output_ptr[channel] = static_cast(a); + uint16_t a = 
(acc[channel] + filter_count / 2) / filter_count; + a = std::max(a, params.quantized_activation_min); + a = std::min(a, params.quantized_activation_max); + output_ptr[channel] = static_cast(a); } } } @@ -3115,8 +3173,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, } inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + const uint8_t* input_data, const RuntimeShape& output_shape, + uint8_t* output_data) { ruy::profiler::ScopeLabel label("MaxPool/8bit"); // Here, and in other pooling ops, in order to maintain locality of reference, @@ -3139,7 +3197,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, const int stride_height = params.stride_height; const int stride_width = params.stride_width; - uint8 acc[kPoolingAccTrancheSize]; + uint8_t acc[kPoolingAccTrancheSize]; for (int batch = 0; batch < batches; ++batch) { // We proceed through the depth in tranches (see comment above). The // depth_base is the depth at the beginning of the tranche. The @@ -3161,15 +3219,15 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); memset(acc, 0, tranche_depth * sizeof(acc[0])); - const uint8* input_ptr = + const uint8_t* input_ptr = input_data + depth_base + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { - const uint8* input_row_ptr = + const uint8_t* input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); for (int fx = filter_x_start; fx < filter_x_end; fx++) { - const uint8* input_channel_ptr = input_row_ptr; + const uint8_t* input_channel_ptr = input_row_ptr; int channel = 0; #ifdef USE_NEON for (; channel <= tranche_depth - 16; channel += 16) { @@ -3194,8 +3252,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, input_row_ptr += depth; } } - uint8* output_ptr = output_data + Offset(output_shape, batch, out_y, - out_x, depth_base); + uint8_t* output_ptr = output_data + Offset(output_shape, batch, out_y, + out_x, depth_base); int channel = 0; #ifdef USE_NEON for (; channel <= tranche_depth - 16; channel += 16) { @@ -3212,10 +3270,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, } #endif for (; channel < tranche_depth; ++channel) { - uint8 a = acc[channel]; - a = std::max(a, params.quantized_activation_min); - a = std::min(a, params.quantized_activation_max); - output_ptr[channel] = static_cast(a); + uint8_t a = acc[channel]; + a = std::max(a, params.quantized_activation_min); + a = std::min(a, params.quantized_activation_max); + output_ptr[channel] = static_cast(a); } } } @@ -3498,7 +3556,7 @@ inline void Softmax(const SoftmaxParams& params, // softmax(x) = e^(x - CONST) / sum(e^(x - CONST), 0...n) // // For quantization, `x` in our case is (input_q - input_zp) * input_s -// For uint8 case (int8 can be handled similarly), the range is [0, 255] +// For uint8_t case (int8_t can be handled similarly), the range is [0, 255] // // so if we let // CONST = (255 - input_zp) * input_s @@ -3508,7 +3566,7 @@ inline void Softmax(const SoftmaxParams& params, // sum(e^(input_q - 255) * input_s, 0...n) -------- (2) // // the good thing about (1) is it's within the range of (0, 1), so we can -// approximate its result with uint16. 
+// approximate its result with uint16_t. // (1) = uint8_out * 1 / 2^16. // // so (1) is lookup_uint8_table(input_zp) * 1 / 2^16. @@ -3522,8 +3580,8 @@ inline void Softmax(const SoftmaxParams& params, // + // output_zp // -// We can actually further improve the performance by using uint8 instead of -// uint16. But that we may lose some accuracy, so we need to pay attention +// We can actually further improve the performance by using uint8_t instead of +// uint16_t. But that we may lose some accuracy, so we need to pay attention // to that. inline void PopulateSoftmaxUInt8LookupTable(SoftmaxParams* data, float input_scale, float beta) { @@ -3553,7 +3611,7 @@ inline int FindMaxValue(int size, const uint8_t* input_data, uint8_t offset) { input_value = veorq_u8(input_value, offset_dup); max_val_dup = vmaxq_u8(input_value, max_val_dup); } - max_val = std::max(max_val, static_cast(vmaxvq_u8(max_val_dup))); + max_val = std::max(max_val, static_cast(vmaxvq_u8(max_val_dup))); #endif for (; j < size; ++j) { @@ -3608,12 +3666,12 @@ inline void SoftmaxInt8LUT(const SoftmaxParams& params, const int32_t clamp_min = std::numeric_limits::min(); // Offset is used to interpret the input data "correctly". - // If the input is uint8, the data will be unchanged. - // If the input is int8, since it will be reinterpret as uint8. + // If the input is uint8_t, the data will be unchanged. + // If the input is int8_t, since it will be reinterpret as uint8_t. // e.g., - // int8 127 will be applied "offset" to become 255 in uint8. + // int8_t 127 will be applied "offset" to become 255 in uint8_t. uint8_t offset = 0; - if (std::is_same::value) { + if (std::is_same::value) { offset = 0x80; } @@ -3641,7 +3699,7 @@ inline void SoftmaxInt8LUT(const SoftmaxParams& params, // Find max quantized value. int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset); - int32 sum_exp = 0; + int32_t sum_exp = 0; const int32_t max_uint8 = std::numeric_limits::max(); const uint8_t table_offset = max_uint8 - max_val; @@ -3686,7 +3744,7 @@ inline void SoftmaxInt8LUT(const SoftmaxParams& params, const float inv_sum_exp = 1.0f / (sum_exp * params.scale); - int32 multiplier, shift; + int32_t multiplier, shift; QuantizeMultiplier(inv_sum_exp, &multiplier, &shift); // Normalize and quantize probabilities. @@ -3782,8 +3840,9 @@ inline void LogSoftmax(const SoftmaxParams& params, // Backwards compatibility. Less optimized than below version. inline void LogSoftmax(const SoftmaxParams& params, - const RuntimeShape& input_shape, const uint8* input_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input_shape, + const uint8_t* input_data, + const RuntimeShape& output_shape, uint8_t* output_data) { reference_ops::LogSoftmax(params, input_shape, input_data, output_shape, output_data); } @@ -3794,7 +3853,7 @@ inline void LogSoftmax(const SoftmaxParams& params, // // To handle quantization, first dequantize the inputs (from doing // e^(input scale * val) where we ignore the zero point since it cancels -// out during subtraction due to the ln) and do a rescale at the end to int8. +// out during subtraction due to the ln) and do a rescale at the end to int8_t. // // Notably this makes use of float and is intended as the optimized // form for quantized execution on CPU. 
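+// [Illustrative sketch] A standalone rendering of the table-based uint8_t
+// softmax idea described in the comments above: build a table of exponentials
+// indexed by the quantized input, shift indices so the row maximum maps to
+// exponent zero, then normalize by the sum. This is only the idea, not the
+// exact PopulateSoftmaxUInt8LookupTable / SoftmaxInt8LUT implementation (which
+// also folds in output quantization); the input zero point cancels out of
+// softmax, so the sketch ignores it.
+//
+//   #include <algorithm>
+//   #include <cmath>
+//   #include <cstdint>
+//   #include <vector>
+//
+//   // table[q] ~ exp(input_scale * beta * (q - 255)), stored as a fraction of
+//   // 65535 so every entry fits in uint16_t.
+//   std::vector<uint16_t> BuildSoftmaxExpTable(float input_scale, float beta) {
+//     std::vector<uint16_t> table(256);
+//     for (int q = 0; q < 256; ++q) {
+//       const float e = std::exp(input_scale * beta * static_cast<float>(q - 255));
+//       table[q] = static_cast<uint16_t>(std::lround(e * 65535.0f));
+//     }
+//     return table;
+//   }
+//
+//   void SoftmaxFromTable(const std::vector<uint16_t>& table,
+//                         const uint8_t* input, int n, float* prob) {
+//     int max_val = 0;
+//     for (int i = 0; i < n; ++i) max_val = std::max<int>(max_val, input[i]);
+//     const int offset = 255 - max_val;  // max element hits table index 255
+//     int64_t sum = 0;
+//     for (int i = 0; i < n; ++i) sum += table[input[i] + offset];
+//     for (int i = 0; i < n; ++i) {
+//       prob[i] = static_cast<float>(table[input[i] + offset]) /
+//                 static_cast<float>(sum);
+//     }
+//   }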
For a fully integer version, @@ -3825,7 +3884,7 @@ inline void LogSoftmax(const SoftmaxParams& params, float input_scale, } float sum_exp = 0.0f; - const int32_t max_uint8 = std::numeric_limits::max(); + const int32_t max_uint8 = std::numeric_limits::max(); // Offset into table to compute exp(scale*(x - xmax)) instead of // exp(scale*(x)) to prevent overflow. const float* table_offset = ¶ms.table[max_uint8 - max_val]; @@ -3875,8 +3934,8 @@ inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape, } inline void Logistic(const LogisticParams& params, - const RuntimeShape& input_shape, const int16* input_data, - const RuntimeShape& output_shape, int16* output_data) { + const RuntimeShape& input_shape, const int16_t* input_data, + const RuntimeShape& output_shape, int16_t* output_data) { ruy::profiler::ScopeLabel label("Logistic/Int16"); const int flat_size = MatchingFlatSize(input_shape, output_shape); @@ -3884,8 +3943,8 @@ inline void Logistic(const LogisticParams& params, } int c = 0; - const int16* input_data_ptr = input_data; - int16* output_data_ptr = output_data; + const int16_t* input_data_ptr = input_data; + int16_t* output_data_ptr = output_data; #ifdef GEMMLOWP_NEON { // F0 uses 0 integer bits, range [-1, 1]. @@ -3988,8 +4047,8 @@ inline void Tanh(const TanhParams&, const RuntimeShape& input_shape, } inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, - const int16* input_data, const RuntimeShape& output_shape, - int16* output_data) { + const int16_t* input_data, const RuntimeShape& output_shape, + int16_t* output_data) { ruy::profiler::ScopeLabel label("Tanh/Int16"); const int input_left_shift = params.input_left_shift; // Support for shifts is limited until we have a parameterized version of @@ -4000,8 +4059,8 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, const int flat_size = MatchingFlatSize(input_shape, output_shape); int c = 0; - const int16* input_data_ptr = input_data; - int16* output_data_ptr = output_data; + const int16_t* input_data_ptr = input_data; + int16_t* output_data_ptr = output_data; #ifdef GEMMLOWP_NEON { // F0 uses 0 integer bits, range [-1, 1]. @@ -4201,11 +4260,14 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, } template -inline void BatchToSpaceND( - const RuntimeShape& unextended_input1_shape, const T* input1_data, - const RuntimeShape& unextended_input2_shape, const int32* block_shape_data, - const RuntimeShape& unextended_input3_shape, const int32* crops_data, - const RuntimeShape& unextended_output_shape, T* output_data) { +inline void BatchToSpaceND(const RuntimeShape& unextended_input1_shape, + const T* input1_data, + const RuntimeShape& unextended_input2_shape, + const int32_t* block_shape_data, + const RuntimeShape& unextended_input3_shape, + const int32_t* crops_data, + const RuntimeShape& unextended_output_shape, + T* output_data) { ruy::profiler::ScopeLabel label("BatchToSpaceND"); TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3); @@ -4305,8 +4367,8 @@ TFLITE_NOINLINE void TypedMemset(void* ptr, T value, size_t num) { // equivalent to a simple input1_data. For Pad, it should point to a zero // value. // -// Note that two typenames are required, so that T=P=int32 is considered a -// specialization distinct from P=int32. +// Note that two typenames are required, so that T=P=int32_t is considered a +// specialization distinct from P=int32_t. 
template <typename T, typename P> inline void PadImpl(const tflite::PadParams& op_params, const RuntimeShape& input_shape, const T* input_data, @@ -4449,11 +4511,11 @@ inline void Pad(const tflite::PadParams& op_params, output_data); } -// The second (pad-value) input can be int32 when, say, the first is uint8. +// The second (pad-value) input can be int32_t when, say, the first is uint8_t. template <typename T> inline void Pad(const tflite::PadParams& op_params, const RuntimeShape& input_shape, const T* input_data, - const int32* pad_value_ptr, const RuntimeShape& output_shape, + const int32_t* pad_value_ptr, const RuntimeShape& output_shape, T* output_data) { const T converted_pad_value = static_cast<T>(*pad_value_ptr); PadImpl(op_params, input_shape, input_data, &converted_pad_value, @@ -4463,9 +4525,9 @@ inline void Pad(const tflite::PadParams& op_params, // This version avoids conflicting template matching. template <> inline void Pad(const tflite::PadParams& op_params, - const RuntimeShape& input_shape, const int32* input_data, - const int32* pad_value_ptr, const RuntimeShape& output_shape, - int32* output_data) { + const RuntimeShape& input_shape, const int32_t* input_data, + const int32_t* pad_value_ptr, const RuntimeShape& output_shape, + int32_t* output_data) { PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, output_data); } @@ -4474,15 +4536,15 @@ inline void Pad(const tflite::PadParams& op_params, // // This pad requires that (a) left and right paddings are in the 4D patterns // {0, h_pad, w_pad, 0}, and (b) memset can be used: *pad_value_ptr == 0 and/or -// T is uint8. +// T is uint8_t. // // There are two versions of pad: Pad and PadV2. In PadV2 there is a second // scalar input that provides the padding value. Therefore pad_value_ptr can be // equivalent to a simple input1_data. For Pad, it should point to a zero // value. // -// Note that two typenames are required, so that T=P=int32 is considered a -// specialization distinct from P=int32. +// Note that two typenames are required, so that T=P=int32_t is considered a +// specialization distinct from P=int32_t.
template inline void PadImageStyleMemset(const tflite::PadParams& op_params, const RuntimeShape& input_shape, @@ -4604,9 +4666,9 @@ inline void PadImageStyle(const tflite::PadParams& op_params, template inline void PadImageStyle(const tflite::PadParams& op_params, const RuntimeShape& input_shape, - const uint8* input_data, const P* pad_value_ptr, + const uint8_t* input_data, const P* pad_value_ptr, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { PadImageStyleMemset(op_params, input_shape, input_data, pad_value_ptr, output_shape, output_data); } @@ -4723,7 +4785,7 @@ inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data, } template -void TransposeIm2col(const ConvParams& params, uint8 zero_byte, +void TransposeIm2col(const ConvParams& params, uint8_t zero_byte, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& filter_shape, const RuntimeShape& output_shape, T* im2col_data) { @@ -4935,7 +4997,7 @@ inline void Quantize(int32_t multiplier, int32_t shift, int32_t total_size, int32_t output_zp, const int32_t output_min, const int32_t output_max, int32_t* scratch, uint8_t* output) { - ruy::profiler::ScopeLabel label("Quantize/uint8"); + ruy::profiler::ScopeLabel label("Quantize/uint8_t"); int i = 0; #ifdef USE_NEON @@ -5000,7 +5062,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, int32_t channel_size, int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, int32_t* scratch, int8_t* output) { - ruy::profiler::ScopeLabel label("Quantize/int8"); + ruy::profiler::ScopeLabel label("Quantize/int8_t"); // Here we're trying to quantize the raw accumulators: // output_channels @@ -5062,7 +5124,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); acc_2 = vminq_s32(acc_2, output_activation_max_vec); - // Saturating cast to int8 and store to destination. + // Saturating cast to int8_t and store to destination. const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2); @@ -5076,12 +5138,12 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, for (; c < channel_size; c++) { for (int n = 0; n < rows; ++n) { int loc = n * channel_size + c; - int32 acc = scratch[loc]; + int32_t acc = scratch[loc]; acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); acc += output_zp; acc = std::max(acc, output_min); acc = std::min(acc, output_max); - output[loc] = static_cast(acc); + output[loc] = static_cast(acc); } } } @@ -5090,7 +5152,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, int32_t channel_size, int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, int32_t* scratch, int16_t* output) { - ruy::profiler::ScopeLabel label("Quantize(Single-rounding)/int16"); + ruy::profiler::ScopeLabel label("Quantize(Single-rounding)/int16_t"); // Here we're trying to quantize the raw accumulators: // output_channels @@ -5152,7 +5214,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); acc_2 = vminq_s32(acc_2, output_activation_max_vec); - // Saturating cast to int16 and store to destination. + // Saturating cast to int16_t and store to destination. 
const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); vst1_s16(reinterpret_cast(output) + loc, acc_s16_1); @@ -5165,12 +5227,12 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, for (; c < channel_size; c++) { for (int n = 0; n < rows; ++n) { int loc = n * channel_size + c; - int32 acc = scratch[loc]; + int32_t acc = scratch[loc]; acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); acc += output_zp; acc = std::max(acc, output_min); acc = std::min(acc, output_max); - output[loc] = static_cast(acc); + output[loc] = static_cast(acc); } } } @@ -5180,7 +5242,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, int32_t channel_size, int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, int32_t* scratch, int8_t* output) { - ruy::profiler::ScopeLabel label("Quantize/int8"); + ruy::profiler::ScopeLabel label("Quantize/int8_t"); // Here we're trying to quantize the raw accumulators: // output_channels @@ -5243,7 +5305,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); acc_2 = vminq_s32(acc_2, output_activation_max_vec); - // Saturating cast to int8 and store to destination. + // Saturating cast to int8_t and store to destination. const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2); @@ -5257,12 +5319,12 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, for (; c < channel_size; c++) { for (int n = 0; n < rows; ++n) { int loc = n * channel_size + c; - int32 acc = scratch[loc]; + int32_t acc = scratch[loc]; acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); acc += output_zp; acc = std::max(acc, output_min); acc = std::min(acc, output_max); - output[loc] = static_cast(acc); + output[loc] = static_cast(acc); } } } @@ -5271,7 +5333,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, int32_t channel_size, int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, int32_t* scratch, int16_t* output) { - ruy::profiler::ScopeLabel label("Quantize(Double-rounding)/int16"); + ruy::profiler::ScopeLabel label("Quantize(Double-rounding)/int16_t"); // Here we're trying to quantize the raw accumulators: // output_channels @@ -5334,7 +5396,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); acc_2 = vminq_s32(acc_2, output_activation_max_vec); - // Saturating cast to int16 and store to destination. + // Saturating cast to int16_t and store to destination. 
const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); vst1_s16(reinterpret_cast(output) + loc, acc_s16_1); @@ -5347,12 +5409,12 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift, for (; c < channel_size; c++) { for (int n = 0; n < rows; ++n) { int loc = n * channel_size + c; - int32 acc = scratch[loc]; + int32_t acc = scratch[loc]; acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); acc += output_zp; acc = std::max(acc, output_min); acc = std::min(acc, output_max); - output[loc] = static_cast(acc); + output[loc] = static_cast(acc); } } } @@ -5363,11 +5425,11 @@ inline void TransposeConvV2( const ConvParams& params, const RuntimeShape& input_shape, const uint8_t* input_data, const RuntimeShape& hwoi_ordered_filter_shape, const uint8_t* hwoi_ordered_filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, uint8_t* output_data, const RuntimeShape& col2im_shape, int32_t* col2im_data, int32_t* scratch_data, CpuBackendContext* cpu_backend_context) { - ruy::profiler::ScopeLabel label("TransposeConvV2/uint8"); + ruy::profiler::ScopeLabel label("TransposeConvV2/uint8_t"); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(hwoi_ordered_filter_shape.DimensionsCount(), 4); TFLITE_DCHECK(col2im_data); @@ -5396,8 +5458,8 @@ inline void TransposeConvV2( const int stride_height = params.stride_height; const int stride_width = params.stride_width; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; const int hwoi_ordered_filter_total_size = filter_height * filter_width * output_depth; @@ -5409,7 +5471,8 @@ inline void TransposeConvV2( lhs_params.zero_point = -params.weights_offset; int32_t* scratch_data_p = scratch_data; - std::fill_n(scratch_data, output_offset * batch_size, static_cast(0)); + std::fill_n(scratch_data, output_offset * batch_size, + static_cast(0)); for (int i = 0; i < batch_size; ++i) { cpu_backend_gemm::MatrixParams rhs_params; rhs_params.order = cpu_backend_gemm::Order::kColMajor; @@ -5450,9 +5513,9 @@ inline void TransposeConvV2( // version. inline void ResizeNearestNeighbor( const tflite::ResizeNearestNeighborParams& op_params, - const RuntimeShape& unextended_input_shape, const uint8* input_data, - const RuntimeShape& output_size_shape, const int32* output_size_data, - const RuntimeShape& unextended_output_shape, uint8* output_data) { + const RuntimeShape& unextended_input_shape, const uint8_t* input_data, + const RuntimeShape& output_size_shape, const int32_t* output_size_data, + const RuntimeShape& unextended_output_shape, uint8_t* output_data) { if (op_params.align_corners || op_params.half_pixel_centers) { // TODO(b/149823713): Add support for align_corners & half_pixel_centers in // this kernel. 
@@ -5469,42 +5532,42 @@ inline void ResizeNearestNeighbor( const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape); - int32 batches = MatchingDim(input_shape, 0, output_shape, 0); - int32 input_height = input_shape.Dims(1); - int32 input_width = input_shape.Dims(2); - int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); // The Tensorflow version of this op allows resize on the width and height // axis only. TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2); - int32 output_height = output_size_data[0]; - int32 output_width = output_size_data[1]; + int32_t output_height = output_size_data[0]; + int32_t output_width = output_size_data[1]; // Convert scales to fixed-point with 16 fractional bits. We add 1 as an // error factor and to avoid zero scales. For example, with input_height = 1, // output_height = 3, the float scaling factor would be non-zero at 1/3. // With fixed-point, this is zero. - int32 height_scale = (input_height << 16) / output_height + 1; - int32 width_scale = (input_width << 16) / output_width + 1; + int32_t height_scale = (input_height << 16) / output_height + 1; + int32_t width_scale = (input_width << 16) / output_width + 1; const int col_offset = input_shape.Dims(3); const int row_offset = input_shape.Dims(2) * col_offset; const int batch_offset = input_shape.Dims(1) * row_offset; - const uint8* input_ptr = input_data; - uint8* output_ptr = output_data; + const uint8_t* input_ptr = input_data; + uint8_t* output_ptr = output_data; for (int b = 0; b < batches; ++b) { for (int y = 0; y < output_height; ++y) { - int32 in_y = std::min((y * height_scale) >> 16, input_height - 1); + int32_t in_y = std::min((y * height_scale) >> 16, input_height - 1); // Check offset calculation is the same as the reference version. See // function comment for details. We check using a non-float version of: // TFLITE_DCHECK_EQ(in_y, std::floor(y * (static_cast(input_height) // / output_height))); TFLITE_DCHECK_LT(y * input_height, output_height + in_y * output_height); TFLITE_DCHECK_GE(y * input_height, in_y * output_height); - const uint8* y_input_ptr = input_ptr + in_y * row_offset; + const uint8_t* y_input_ptr = input_ptr + in_y * row_offset; for (int x = 0; x < output_width; ++x) { - int32 in_x = std::min((x * width_scale) >> 16, input_width - 1); + int32_t in_x = std::min((x * width_scale) >> 16, input_width - 1); // Check offset calculation is the same as the reference version. See // function comment for details. 
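+// [Worked example of the Q16 scale above] With input_height = 10 and
+// output_height = 4: height_scale = (10 << 16) / 4 + 1 = 163841. For output
+// row y = 3, (3 * 163841) >> 16 = 491523 >> 16 = 7, which matches
+// floor(3 * 10 / 4) = floor(7.5) = 7; the std::min against input_height - 1
+// then keeps the index in bounds. The surrounding DCHECKs assert exactly this
+// "same result as the float computation" property without using floats.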
We check using a non-float version of: // TFLITE_DCHECK_EQ(in_y, @@ -5512,7 +5575,7 @@ inline void ResizeNearestNeighbor( // / output_width))); TFLITE_DCHECK_LT(x * input_width, output_width + in_x * output_width); TFLITE_DCHECK_GE(x * input_width, in_x * output_width); - const uint8* x_input_ptr = y_input_ptr + in_x * col_offset; + const uint8_t* x_input_ptr = y_input_ptr + in_x * col_offset; memcpy(output_ptr, x_input_ptr, depth); output_ptr += depth; } @@ -6178,7 +6241,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, const uint8_t* input_data, const RuntimeShape& output_shape, float* output_data) { ruy::profiler::ScopeLabel label("Dequantize/Uint8"); - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = op_params.scale; const int flat_size = MatchingFlatSize(input_shape, output_shape); @@ -6207,7 +6270,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, } #endif // NEON for (; i < flat_size; ++i) { - const int32 val = input_data[i]; + const int32_t val = input_data[i]; const float result = static_cast(scale * (val - zero_point)); output_data[i] = result; } @@ -6218,7 +6281,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, const int8_t* input_data, const RuntimeShape& output_shape, float* output_data) { ruy::profiler::ScopeLabel label("Dequantize/Int8"); - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = op_params.scale; const int flat_size = MatchingFlatSize(input_shape, output_shape); @@ -6246,7 +6309,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, } #endif // NEON for (; i < flat_size; ++i) { - const int32 val = input_data[i]; + const int32_t val = input_data[i]; const float result = static_cast(scale * (val - zero_point)); output_data[i] = result; } @@ -6257,7 +6320,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, const int16_t* input_data, const RuntimeShape& output_shape, float* output_data) { ruy::profiler::ScopeLabel label("Dequantize/Int16"); - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = op_params.scale; const int flat_size = MatchingFlatSize(input_shape, output_shape); @@ -6283,7 +6346,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, } #endif // NEON for (; i < flat_size; ++i) { - const int32 val = input_data[i]; + const int32_t val = input_data[i]; const float result = static_cast(scale * (val - zero_point)); output_data[i] = result; } @@ -6311,11 +6374,11 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, const RuntimeShape& output_shape, int8_t* output_data) { ruy::profiler::ScopeLabel label("Quantize/Int8"); - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = static_cast(op_params.scale); const int flat_size = MatchingFlatSize(input_shape, output_shape); - static constexpr int32 min_val = std::numeric_limits::min(); - static constexpr int32 max_val = std::numeric_limits::max(); + static constexpr int32_t min_val = std::numeric_limits::min(); + static constexpr int32_t max_val = std::numeric_limits::max(); int i = 0; #ifdef USE_NEON @@ -6354,9 +6417,9 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, for (; i < flat_size; ++i) { const float val = input_data[i]; - const int32 unclamped = - 
static_cast(TfLiteRound(val / scale)) + zero_point; - const int32 clamped = std::min(std::max(unclamped, min_val), max_val); + const int32_t unclamped = + static_cast(TfLiteRound(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); output_data[i] = clamped; } } @@ -6368,11 +6431,11 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, const RuntimeShape& output_shape, uint8_t* output_data) { ruy::profiler::ScopeLabel label("Quantize/Uint8"); - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = static_cast(op_params.scale); const int flat_size = MatchingFlatSize(input_shape, output_shape); - static constexpr int32 min_val = std::numeric_limits::min(); - static constexpr int32 max_val = std::numeric_limits::max(); + static constexpr int32_t min_val = std::numeric_limits::min(); + static constexpr int32_t max_val = std::numeric_limits::max(); int i = 0; #ifdef USE_NEON @@ -6412,9 +6475,9 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, for (; i < flat_size; ++i) { const float val = input_data[i]; - const int32 unclamped = - static_cast(TfLiteRound(val / scale)) + zero_point; - const int32 clamped = std::min(std::max(unclamped, min_val), max_val); + const int32_t unclamped = + static_cast(TfLiteRound(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); output_data[i] = clamped; } } @@ -6426,11 +6489,11 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, const RuntimeShape& output_shape, int16_t* output_data) { ruy::profiler::ScopeLabel label("Quantize/Int16"); - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = static_cast(op_params.scale); const int flat_size = MatchingFlatSize(input_shape, output_shape); - static constexpr int32 min_val = std::numeric_limits::min(); - static constexpr int32 max_val = std::numeric_limits::max(); + static constexpr int32_t min_val = std::numeric_limits::min(); + static constexpr int32_t max_val = std::numeric_limits::max(); int i = 0; #ifdef USE_NEON @@ -6468,9 +6531,9 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, for (; i < flat_size; ++i) { const float val = input_data[i]; - const int32 unclamped = - static_cast(TfLiteRound(val / scale)) + zero_point; - const int32 clamped = std::min(std::max(unclamped, min_val), max_val); + const int32_t unclamped = + static_cast(TfLiteRound(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); output_data[i] = clamped; } } @@ -6484,9 +6547,9 @@ inline int16x8x4_t SaturatingRounding( int16x8_t input_val_0, int16x8_t input_val_1, int16x8_t input_val_2, int16x8_t input_val_3, int input_left_shift, int input_multiplier) { // This performs what is expressed in the scalar code as - // const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul( - // static_cast(input_val_centered * (1 << input_left_shift)), - // static_cast(input_multiplier)); + // const int16_t input_val_rescaled = SaturatingRoundingDoublingHighMul( + // static_cast(input_val_centered * (1 << input_left_shift)), + // static_cast(input_multiplier)); const int16x8_t left_shift_dup = vdupq_n_s16(input_left_shift); const int16x8_t input_val_shifted_0 = vshlq_s16(input_val_0, left_shift_dup); const int16x8_t input_val_shifted_1 = vshlq_s16(input_val_1, left_shift_dup); @@ -6623,15 
+6686,17 @@ inline void ClampWithRangeAndStore(int8_t* output_dst, int8x16_t input_val, inline void Tanh16bitPrecision(const TanhParams& params, const RuntimeShape& input_shape, - const uint8* input_data, + const uint8_t* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { // Note that this is almost the exact same code as in Logistic(). ruy::profiler::ScopeLabel label("Tanh/Uint8"); - const int32 input_zero_point = params.input_zero_point; - const int32 input_range_radius = params.input_range_radius; - const int16 input_multiplier = static_cast(params.input_multiplier); - const int16 input_left_shift = static_cast(params.input_left_shift); + const int32_t input_zero_point = params.input_zero_point; + const int32_t input_range_radius = params.input_range_radius; + const int16_t input_multiplier = + static_cast(params.input_multiplier); + const int16_t input_left_shift = + static_cast(params.input_left_shift); const int size = MatchingFlatSize(input_shape, output_shape); int c = 0; @@ -6647,7 +6712,7 @@ inline void Tanh16bitPrecision(const TanhParams& params, // Handle 32 values at a time for (; c <= size - 32; c += 32) { - // Read input uint8 values, cast to int16 and subtract input_zero_point + // Read input uint8_t values, cast to int16_t and subtract input_zero_point using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint; const int16x8x2_t input_val_centered_0_1 = Load16AndSubtractZeroPoint(input_data + c, input_zero_point); @@ -6684,7 +6749,7 @@ inline void Tanh16bitPrecision(const TanhParams& params, output_val_s16.val[3] = vaddq_s16(output_val_s16.val[3], output_zero_point_s16); - // Cast output values to uint8, saturating + // Cast output values to uint8_t, saturating uint8x16_t output_val_u8_0_1 = vcombine_u8( vqmovun_s16(output_val_s16.val[0]), vqmovun_s16(output_val_s16.val[1])); uint8x16_t output_val_u8_2_3 = vcombine_u8( @@ -6697,32 +6762,32 @@ inline void Tanh16bitPrecision(const TanhParams& params, #endif // GEMMLOWP_NEON // Leftover loop: handle one value at a time with scalar code. 
for (; c < size; ++c) { - const uint8 input_val_u8 = input_data[c]; - const int16 input_val_centered = - static_cast(input_val_u8) - input_zero_point; - uint8 output_val; + const uint8_t input_val_u8 = input_data[c]; + const int16_t input_val_centered = + static_cast(input_val_u8) - input_zero_point; + uint8_t output_val; if (input_val_centered < -input_range_radius) { output_val = 0; } else if (input_val_centered > input_range_radius) { output_val = 255; } else { using gemmlowp::SaturatingRoundingDoublingHighMul; - const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul( - static_cast(input_val_centered * (1 << input_left_shift)), - static_cast(input_multiplier)); - using FixedPoint4 = gemmlowp::FixedPoint; - using FixedPoint0 = gemmlowp::FixedPoint; + const int16_t input_val_rescaled = SaturatingRoundingDoublingHighMul( + static_cast(input_val_centered * (1 << input_left_shift)), + static_cast(input_multiplier)); + using FixedPoint4 = gemmlowp::FixedPoint; + using FixedPoint0 = gemmlowp::FixedPoint; const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled); const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4); using gemmlowp::RoundingDivideByPOT; - int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 8); + int16_t output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 8); output_val_s16 += output_zero_point; if (output_val_s16 == 256) { output_val_s16 = 255; } TFLITE_DCHECK_GE(output_val_s16, 0); TFLITE_DCHECK_LE(output_val_s16, 255); - output_val = static_cast(output_val_s16); + output_val = static_cast(output_val_s16); } output_data[c] = output_val; } @@ -6730,15 +6795,17 @@ inline void Tanh16bitPrecision(const TanhParams& params, inline void Tanh16bitPrecision(const TanhParams& params, const RuntimeShape& input_shape, - const int8* input_data, + const int8_t* input_data, const RuntimeShape& output_shape, - int8* output_data) { + int8_t* output_data) { // Note that this is almost the exact same code as in Logistic(). 
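+// [Worked example] In both the uint8_t path above and the int8_t path below,
+// gemmlowp::tanh produces a Q0.15 value: raw in [-32768, 32767] representing
+// [-1.0, +1.0). RoundingDivideByPOT(raw, 8) rescales that to roughly
+// [-128, 128]. For uint8_t output a zero point of 128 is then added, giving
+// [0, 256] with the single overflow 256 folded to 255 (e.g. raw 32767:
+// (32767 + 128) >> 8 = 128, then +128 = 256 -> stored as 255; raw -32768:
+// -32768 / 256 = -128, then +128 = 0). For int8_t output no offset is added
+// and the overflow value 128 is folded to 127.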
ruy::profiler::ScopeLabel label("Tanh/Int8"); - const int32 input_zero_point = params.input_zero_point; - const int32 input_range_radius = params.input_range_radius; - const int16 input_multiplier = static_cast(params.input_multiplier); - const int16 input_left_shift = static_cast(params.input_left_shift); + const int32_t input_zero_point = params.input_zero_point; + const int32_t input_range_radius = params.input_range_radius; + const int16_t input_multiplier = + static_cast(params.input_multiplier); + const int16_t input_left_shift = + static_cast(params.input_left_shift); const int size = MatchingFlatSize(input_shape, output_shape); int c = 0; @@ -6751,7 +6818,7 @@ inline void Tanh16bitPrecision(const TanhParams& params, // Handle 32 values at a time for (; c <= size - 32; c += 32) { - // Read input int8 values, cast to int16 and subtract input_zero_point + // Read input int8_t values, cast to int16_t and subtract input_zero_point using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint; const int16x8x2_t input_val_centered_0_1 = Load16AndSubtractZeroPoint(input_data + c, input_zero_point); @@ -6778,7 +6845,7 @@ inline void Tanh16bitPrecision(const TanhParams& params, int16x8x4_t output_val_s16 = FixedPoint4Tanh(input_val_rescaled); - // Cast output values to uint8, saturating + // Cast output values to uint8_t, saturating int8x16_t output_val_s8_0_1 = vcombine_s8( vqmovn_s16(output_val_s16.val[0]), vqmovn_s16(output_val_s16.val[1])); int8x16_t output_val_s8_2_3 = vcombine_s8( @@ -6791,31 +6858,31 @@ inline void Tanh16bitPrecision(const TanhParams& params, #endif // GEMMLOWP_NEON // Leftover loop: handle one value at a time with scalar code. for (; c < size; ++c) { - const int8 input_val_s8 = input_data[c]; - const int16 input_val_centered = - static_cast(input_val_s8) - input_zero_point; - int8 output_val; + const int8_t input_val_s8 = input_data[c]; + const int16_t input_val_centered = + static_cast(input_val_s8) - input_zero_point; + int8_t output_val; if (input_val_centered <= -input_range_radius) { output_val = -128; } else if (input_val_centered >= input_range_radius) { output_val = 127; } else { using gemmlowp::SaturatingRoundingDoublingHighMul; - const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul( - static_cast(input_val_centered * (1 << input_left_shift)), - static_cast(input_multiplier)); - using FixedPoint4 = gemmlowp::FixedPoint; - using FixedPoint0 = gemmlowp::FixedPoint; + const int16_t input_val_rescaled = SaturatingRoundingDoublingHighMul( + static_cast(input_val_centered * (1 << input_left_shift)), + static_cast(input_multiplier)); + using FixedPoint4 = gemmlowp::FixedPoint; + using FixedPoint0 = gemmlowp::FixedPoint; const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled); const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4); using gemmlowp::RoundingDivideByPOT; - int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 8); + int16_t output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 8); if (output_val_s16 == 128) { output_val_s16 = 127; } TFLITE_DCHECK_GE(output_val_s16, -128); TFLITE_DCHECK_LE(output_val_s16, 127); - output_val = static_cast(output_val_s16); + output_val = static_cast(output_val_s16); } output_data[c] = output_val; } @@ -6823,14 +6890,15 @@ inline void Tanh16bitPrecision(const TanhParams& params, inline void Logistic16bitPrecision(const LogisticParams& params, const RuntimeShape& input_shape, - const uint8* input_data, + const uint8_t* input_data, const RuntimeShape& output_shape, - uint8* 
output_data) { + uint8_t* output_data) { ruy::profiler::ScopeLabel label("Logistic/Uint8"); - const int32 input_zero_point = params.input_zero_point; - const int32 input_range_radius = params.input_range_radius; - const int32 input_multiplier = params.input_multiplier; - const int16 input_left_shift = static_cast(params.input_left_shift); + const int32_t input_zero_point = params.input_zero_point; + const int32_t input_range_radius = params.input_range_radius; + const int32_t input_multiplier = params.input_multiplier; + const int16_t input_left_shift = + static_cast(params.input_left_shift); const int size = MatchingFlatSize(input_shape, output_shape); int c = 0; @@ -6843,7 +6911,7 @@ inline void Logistic16bitPrecision(const LogisticParams& params, // Handle 32 values at a time for (; c <= size - 32; c += 32) { - // Read input uint8 values, cast to int16 and subtract input_zero_point + // Read input uint8_t values, cast to int16_t and subtract input_zero_point using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint; const int16x8x2_t input_val_centered_0_1 = Load16AndSubtractZeroPoint(input_data + c, input_zero_point); @@ -6870,7 +6938,7 @@ inline void Logistic16bitPrecision(const LogisticParams& params, int16x8x4_t output_val_s16 = FixedPoint4Logistic(input_val_rescaled); - // Cast output values to uint8, saturating + // Cast output values to uint8_t, saturating uint8x16_t output_val_u8_0_1 = vcombine_u8( vqmovun_s16(output_val_s16.val[0]), vqmovun_s16(output_val_s16.val[1])); uint8x16_t output_val_u8_2_3 = vcombine_u8( @@ -6883,31 +6951,31 @@ inline void Logistic16bitPrecision(const LogisticParams& params, #endif // GEMMLOWP_NEON // Leftover loop: handle one value at a time with scalar code. for (; c < size; ++c) { - const uint8 input_val_u8 = input_data[c]; - const int16 input_val_centered = - static_cast(input_val_u8) - input_zero_point; - uint8 output_val; + const uint8_t input_val_u8 = input_data[c]; + const int16_t input_val_centered = + static_cast(input_val_u8) - input_zero_point; + uint8_t output_val; if (input_val_centered < -input_range_radius) { output_val = 0; } else if (input_val_centered > input_range_radius) { output_val = 255; } else { using gemmlowp::SaturatingRoundingDoublingHighMul; - const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul( - static_cast(input_val_centered * (1 << input_left_shift)), - static_cast(input_multiplier)); - using FixedPoint4 = gemmlowp::FixedPoint; - using FixedPoint0 = gemmlowp::FixedPoint; + const int16_t input_val_rescaled = SaturatingRoundingDoublingHighMul( + static_cast(input_val_centered * (1 << input_left_shift)), + static_cast(input_multiplier)); + using FixedPoint4 = gemmlowp::FixedPoint; + using FixedPoint0 = gemmlowp::FixedPoint; const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled); const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4); using gemmlowp::RoundingDivideByPOT; - int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 7); + int16_t output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 7); if (output_val_s16 == 256) { output_val_s16 = 255; } TFLITE_DCHECK_GE(output_val_s16, 0); TFLITE_DCHECK_LE(output_val_s16, 255); - output_val = static_cast(output_val_s16); + output_val = static_cast(output_val_s16); } output_data[c] = output_val; } @@ -6915,18 +6983,19 @@ inline void Logistic16bitPrecision(const LogisticParams& params, inline void Logistic16bitPrecision(const LogisticParams& params, const RuntimeShape& input_shape, - const int8* input_data, + const 
int8_t* input_data, const RuntimeShape& output_shape, - int8* output_data) { + int8_t* output_data) { ruy::profiler::ScopeLabel label("Logistic/Int8"); - const int32 input_zero_point = params.input_zero_point; - const int32 input_range_radius = params.input_range_radius; - const int32 input_multiplier = params.input_multiplier; - const int16 input_left_shift = static_cast(params.input_left_shift); + const int32_t input_zero_point = params.input_zero_point; + const int32_t input_range_radius = params.input_range_radius; + const int32_t input_multiplier = params.input_multiplier; + const int16_t input_left_shift = + static_cast(params.input_left_shift); const int size = MatchingFlatSize(input_shape, output_shape); int c = 0; - const int16 output_zero_point = 128; + const int16_t output_zero_point = 128; // TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed. // The converted versions of gemmlowp::tanh and gemmlowp::logistic, done by // arm_sse_2_neon.h, produce incorrect results with int16x8_t data types. @@ -6937,7 +7006,7 @@ inline void Logistic16bitPrecision(const LogisticParams& params, // Handle 32 values at a time for (; c <= size - 32; c += 32) { - // Read input int8 values, cast to int16 and subtract input_zero_point + // Read input int8_t values, cast to int16_t and subtract input_zero_point using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint; const int16x8x2_t input_val_centered_0_1 = Load16AndSubtractZeroPoint(input_data + c, input_zero_point); @@ -6974,7 +7043,7 @@ inline void Logistic16bitPrecision(const LogisticParams& params, output_val_s16.val[3] = vsubq_s16(output_val_s16.val[3], output_zero_point_dup); - // Cast output values to int8, saturating + // Cast output values to int8_t, saturating int8x16_t output_val_s8_0_1 = vcombine_s8( vqmovn_s16(output_val_s16.val[0]), vqmovn_s16(output_val_s16.val[1])); int8x16_t output_val_s8_2_3 = vcombine_s8( @@ -6987,32 +7056,32 @@ inline void Logistic16bitPrecision(const LogisticParams& params, #endif // GEMMLOWP_NEON // Leftover loop: handle one value at a time with scalar code. 
for (; c < size; ++c) { - const int8 input_val_s8 = input_data[c]; - const int16 input_val_centered = - static_cast(input_val_s8) - input_zero_point; - int8 output_val; + const int8_t input_val_s8 = input_data[c]; + const int16_t input_val_centered = + static_cast(input_val_s8) - input_zero_point; + int8_t output_val; if (input_val_centered < -input_range_radius) { output_val = -128; } else if (input_val_centered > input_range_radius) { output_val = 127; } else { using gemmlowp::SaturatingRoundingDoublingHighMul; - const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul( - static_cast(input_val_centered * (1 << input_left_shift)), - static_cast(input_multiplier)); - using FixedPoint4 = gemmlowp::FixedPoint; - using FixedPoint0 = gemmlowp::FixedPoint; + const int16_t input_val_rescaled = SaturatingRoundingDoublingHighMul( + static_cast(input_val_centered * (1 << input_left_shift)), + static_cast(input_multiplier)); + using FixedPoint4 = gemmlowp::FixedPoint; + using FixedPoint0 = gemmlowp::FixedPoint; const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled); const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4); using gemmlowp::RoundingDivideByPOT; - int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 7); + int16_t output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 7); output_val_s16 -= output_zero_point; if (output_val_s16 == 128) { output_val_s16 = 127; } TFLITE_DCHECK_GE(output_val_s16, -128); TFLITE_DCHECK_LE(output_val_s16, 127); - output_val = static_cast(output_val_s16); + output_val = static_cast(output_val_s16); } output_data[c] = output_val; } @@ -7343,8 +7412,8 @@ void Transpose(const TransposeParams& unshrinked_params, // Assume input1 & input2 have the same scale & zero point. inline void MaximumElementwise(int size, const ArithmeticParams& params, - const int8* input1_data, const int8* input2_data, - int8* output_data) { + const int8_t* input1_data, + const int8_t* input2_data, int8_t* output_data) { ruy::profiler::ScopeLabel label("MaximumElementwiseInt8/8bit"); int i = 0; #ifdef USE_NEON @@ -7357,15 +7426,16 @@ inline void MaximumElementwise(int size, const ArithmeticParams& params, } #endif // USE_NEON for (; i < size; ++i) { - const int8 input1_val = input1_data[i]; - const int8 input2_val = input2_data[i]; + const int8_t input1_val = input1_data[i]; + const int8_t input2_val = input2_data[i]; output_data[i] = std::max(input1_val, input2_val); } } inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params, - int8 input1_data, const int8* input2_data, - int8* output_data) { + int8_t input1_data, + const int8_t* input2_data, + int8_t* output_data) { ruy::profiler::ScopeLabel label("MaximumScalarBroadcastInt8/8bit"); int i = 0; @@ -7379,15 +7449,15 @@ inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params, } #endif // USE_NEON for (; i < size; ++i) { - const int8 input2_val = input2_data[i]; + const int8_t input2_val = input2_data[i]; output_data[i] = std::max(input1_data, input2_val); } } // Assume input1 & input2 have the same scale & zero point. 
inline void MinimumElementwise(int size, const ArithmeticParams& params, - const int8* input1_data, const int8* input2_data, - int8* output_data) { + const int8_t* input1_data, + const int8_t* input2_data, int8_t* output_data) { ruy::profiler::ScopeLabel label("MinimumElementwiseInt8/8bit"); int i = 0; #ifdef USE_NEON @@ -7400,15 +7470,16 @@ inline void MinimumElementwise(int size, const ArithmeticParams& params, } #endif // USE_NEON for (; i < size; ++i) { - const int8 input1_val = input1_data[i]; - const int8 input2_val = input2_data[i]; + const int8_t input1_val = input1_data[i]; + const int8_t input2_val = input2_data[i]; output_data[i] = std::min(input1_val, input2_val); } } inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params, - int8 input1_data, const int8* input2_data, - int8* output_data) { + int8_t input1_data, + const int8_t* input2_data, + int8_t* output_data) { ruy::profiler::ScopeLabel label("MinimumScalarBroadcastInt8/8bit"); int i = 0; @@ -7422,7 +7493,7 @@ inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params, } #endif // USE_NEON for (; i < size; ++i) { - const int8 input2_val = input2_data[i]; + const int8_t input2_val = input2_data[i]; output_data[i] = std::min(input1_data, input2_val); } } @@ -7430,11 +7501,11 @@ inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params, template inline void BroadcastMaximumDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int8* input1_data, + const int8_t* input1_data, const RuntimeShape& input2_shape, - const int8* input2_data, + const int8_t* input2_data, const RuntimeShape& output_shape, - int8* output_data, Op op) { + int8_t* output_data, Op op) { if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { return reference_ops::MaximumMinimumBroadcastSlow( input1_shape, input1_data, input2_shape, input2_data, output_shape, @@ -7449,11 +7520,11 @@ inline void BroadcastMaximumDispatch(const ArithmeticParams& params, template inline void BroadcastMinimumDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int8* input1_data, + const int8_t* input1_data, const RuntimeShape& input2_shape, - const int8* input2_data, + const int8_t* input2_data, const RuntimeShape& output_shape, - int8* output_data, Op op) { + int8_t* output_data, Op op) { if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { return reference_ops::MaximumMinimumBroadcastSlow( input1_shape, input1_data, input2_shape, input2_data, output_shape, @@ -7979,7 +8050,7 @@ inline TfLiteStatus Conv3D( ruy::profiler::ScopeLabel label("Conv3D"); // NB: the float 0.0f value is represented by all zero bytes. 
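// A minimal sketch of the int8_t element-wise maximum/minimum pattern used by
// MaximumElementwise and MinimumElementwise above: a 16-lane NEON fast path
// with a scalar tail. vld1q_s8 / vmaxq_s8 / vst1q_s8 are standard Arm NEON
// intrinsics; the function name is illustrative.
#include <algorithm>
#include <cstdint>
#ifdef USE_NEON
#include <arm_neon.h>
#endif

inline void ElementwiseMaxInt8(int size, const int8_t* input1,
                               const int8_t* input2, int8_t* output) {
  int i = 0;
#ifdef USE_NEON
  for (; i <= size - 16; i += 16) {
    const int8x16_t a = vld1q_s8(input1 + i);
    const int8x16_t b = vld1q_s8(input2 + i);
    vst1q_s8(output + i, vmaxq_s8(a, b));
  }
#endif  // USE_NEON
  // Scalar tail handles the remaining (size % 16) elements.
  for (; i < size; ++i) {
    output[i] = std::max(input1[i], input2[i]);
  }
}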
- const uint8 float_zero_byte = 0x00; + const uint8_t float_zero_byte = 0x00; const float* gemm_input_data = nullptr; const RuntimeShape* gemm_input_shape = nullptr; const int filter_width = filter_shape.Dims(2); diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index 469abbdc7b3c17..7bf649b0065fd5 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -151,11 +151,11 @@ inline void ReluX(const tflite::ReluParams& params, ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)"); const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; ++i) { - const int32 val = static_cast(input_data[i]); - int32 clamped = params.output_offset + - MultiplyByQuantizedMultiplier(val - params.input_offset, - params.output_multiplier, - params.output_shift); + const int32_t val = static_cast(input_data[i]); + int32_t clamped = params.output_offset + + MultiplyByQuantizedMultiplier(val - params.input_offset, + params.output_multiplier, + params.output_shift); clamped = std::max(params.quantized_activation_min, clamped); clamped = std::min(params.quantized_activation_max, clamped); output_data[i] = static_cast(clamped); @@ -185,11 +185,11 @@ inline void ReluX(const tflite::ActivationParams& params, // generate max(D1, D2) nested for loops. inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, + const uint8_t* unswitched_input1_data, const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, + const uint8_t* unswitched_input2_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { ArithmeticParams switched_params = unswitched_params; switched_params.input1_offset = unswitched_params.input2_offset; switched_params.input2_offset = unswitched_params.input1_offset; @@ -200,25 +200,25 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, const ArithmeticParams& params = use_unswitched ? unswitched_params : switched_params; - const uint8* input1_data = + const uint8_t* input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const uint8* input2_data = + const uint8_t* input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; // Fivefold nested loops. The second input resets its position for each // iteration of the second loop. The first input resets its position at the // beginning of the fourth loop. The innermost loop is an elementwise Mul of // sections of the arrays. 
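// A minimal sketch of the per-element requantization performed by the
// quantized ReluX hunk above, assuming MultiplyByQuantizedMultiplier(int32_t,
// int32_t, int) from kernels/internal/common.h is in scope. The uint8_t
// input/output type and parameter names are illustrative; the real kernel is
// templated on the tensor type.
#include <algorithm>
#include <cstdint>

inline uint8_t RequantizeReluX(uint8_t input, int32_t input_offset,
                               int32_t output_offset,
                               int32_t output_multiplier, int output_shift,
                               int32_t activation_min, int32_t activation_max) {
  const int32_t val = static_cast<int32_t>(input);
  int32_t clamped =
      output_offset + MultiplyByQuantizedMultiplier(val - input_offset,
                                                    output_multiplier,
                                                    output_shift);
  // Clamp into the quantized activation range before narrowing back to uint8_t.
  clamped = std::max(activation_min, clamped);
  clamped = std::min(activation_max, clamped);
  return static_cast<uint8_t>(clamped);
}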
- uint8* output_data_ptr = output_data; - const uint8* input1_data_ptr = input1_data; - const uint8* input2_data_reset = input2_data; + uint8_t* output_data_ptr = output_data; + const uint8_t* input1_data_ptr = input1_data; + const uint8_t* input2_data_reset = input2_data; int y0 = params.broadcast_shape[0]; int y1 = params.broadcast_shape[1]; int y2 = params.broadcast_shape[2]; int y3 = params.broadcast_shape[3]; int y4 = params.broadcast_shape[4]; for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr; + const uint8_t* input2_data_ptr; for (int i1 = 0; i1 < y1; ++i1) { input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { @@ -236,9 +236,9 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, - const RuntimeShape& output_shape, int16* output_data) { + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, int16_t* output_data) { ruy::profiler::ScopeLabel label("Mul/Int16"); const int flat_size = @@ -255,13 +255,13 @@ inline void Mul(const ArithmeticParams& params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, uint8_t* output_data) { ruy::profiler::ScopeLabel label("Mul/Int16Uint8"); - int32 output_offset = params.output_offset; - int32 output_activation_min = params.quantized_activation_min; - int32 output_activation_max = params.quantized_activation_max; + int32_t output_offset = params.output_offset; + int32_t output_activation_min = params.quantized_activation_min; + int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int flat_size = @@ -273,12 +273,12 @@ inline void Mul(const ArithmeticParams& params, F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]); - int16 rescaled_result = + int16_t rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8); - int16 clamped_result = - std::min(output_activation_max - output_offset, rescaled_result); - clamped_result = - std::max(output_activation_min - output_offset, clamped_result); + int16_t clamped_result = std::min( + output_activation_max - output_offset, rescaled_result); + clamped_result = std::max(output_activation_min - output_offset, + clamped_result); output_data[i] = output_offset + clamped_result; } } @@ -291,14 +291,15 @@ inline void Sub16(const ArithmeticParams& params, const int input1_shift = params.input1_shift; const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - const int16 output_activation_min = params.quantized_activation_min; - const int16 output_activation_max = params.quantized_activation_max; + const int16_t output_activation_min = params.quantized_activation_min; + const int16_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); TFLITE_DCHECK_LE(input1_shift, 0); TFLITE_DCHECK_LE(params.input2_shift, 0); - const int16* 
not_shift_input = input1_shift == 0 ? input1_data : input2_data; - const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; + const int16_t* not_shift_input = + input1_shift == 0 ? input1_data : input2_data; + const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data; const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift; @@ -310,8 +311,8 @@ inline void Sub16(const ArithmeticParams& params, F0 scaled_input = F0::FromRaw( gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = SaturatingSub(input_ready_scaled, scaled_input); - const int16 raw_output = result.raw(); - const int16 clamped_output = std::min( + const int16_t raw_output = result.raw(); + const int16_t clamped_output = std::min( output_activation_max, std::max(output_activation_min, raw_output)); output_data[i] = clamped_output; } @@ -323,8 +324,8 @@ inline void Sub16(const ArithmeticParams& params, F0 scaled_input = F0::FromRaw( gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = SaturatingSub(scaled_input, input_ready_scaled); - const int16 raw_output = result.raw(); - const int16 clamped_output = std::min( + const int16_t raw_output = result.raw(); + const int16_t clamped_output = std::min( output_activation_max, std::max(output_activation_min, raw_output)); output_data[i] = clamped_output; } @@ -395,15 +396,15 @@ void Unpack(const UnpackParams& params, const RuntimeShape& input_shape, template void PackWithScaling(const PackParams& params, const RuntimeShape* const* input_shapes, - const uint8* const* input_data, - const RuntimeShape& output_shape, uint8* output_data) { + const uint8_t* const* input_data, + const RuntimeShape& output_shape, uint8_t* output_data) { ruy::profiler::ScopeLabel label("PackWithScaling"); const int dimensions = output_shape.DimensionsCount(); int axis = params.axis; - const int32* input_zeropoint = params.input_zeropoint; + const int32_t* input_zeropoint = params.input_zeropoint; const float* input_scale = params.input_scale; int inputs_count = params.inputs_count; - const int32 output_zeropoint = params.output_zeropoint; + const int32_t output_zeropoint = params.output_zeropoint; const float output_scale = params.output_scale; int outer_size = 1; @@ -599,7 +600,7 @@ inline GatherNdHelperResult GatherNdHelper(const RuntimeShape& params_shape, // Implements GatherNd. // Returns an error if any of the indices_data would cause an out of bounds // memory read. -template +template inline TfLiteStatus GatherNd(const RuntimeShape& params_shape, const ParamsT* params_data, const RuntimeShape& indices_shape, @@ -627,7 +628,7 @@ inline TfLiteStatus GatherNd(const RuntimeShape& params_shape, // Implements GatherNd on strings. // Returns an error if any of the indices_data would cause an out of bounds // memory read. -template +template inline TfLiteStatus GatherNdString(const RuntimeShape& params_shape, const TfLiteTensor* params_data, const RuntimeShape& indices_shape, diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc index 34b484a4ca9c2c..f5f0d40da261f7 100644 --- a/tensorflow/lite/kernels/mul_test.cc +++ b/tensorflow/lite/kernels/mul_test.cc @@ -541,6 +541,64 @@ TEST_P(MulOpTest, Int32VariousInputShapes) { } } +// Neon intrinsics are only dispatched when tensor has at least 16 elements. 
+TEST_P(MulOpTest, Int32LargeInputShapeNoActivation) { + bool constant_tensors = GetParam(); + if (SingleOpModel::GetForceUseNnapi() && constant_tensors) { + // NNAPI does not support graphs with all constant inputs. + return; + } + const std::vector test_shape = {4, 4, 4, 4}; + constexpr int kFlatSize = 4 * 4 * 4 * 4; + + std::vector lhs_data(kFlatSize); + std::iota(lhs_data.begin(), lhs_data.end(), 0); + + std::vector rhs_data(kFlatSize); + std::iota(rhs_data.begin(), rhs_data.end(), 0); + + IntegerMulOpModel m( + {TensorType_INT32, test_shape}, {TensorType_INT32, test_shape}, + {TensorType_INT32, {}}, ActivationFunctionType_NONE, lhs_data, rhs_data, + constant_tensors); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + + const std::vector output = m.GetOutput(); + ASSERT_EQ(output.size(), kFlatSize); + for (int i = 0; i < kFlatSize; ++i) { + EXPECT_EQ(output[i], i * i); + } +} + +// Neon intrinsics are only dispatched when tensor has at least 16 elements. +TEST_P(MulOpTest, Int32LargeInputShapeRELU6) { + bool constant_tensors = GetParam(); + if (SingleOpModel::GetForceUseNnapi() && constant_tensors) { + // NNAPI does not support graphs with all constant inputs. + return; + } + const std::vector test_shape = {4, 4, 4, 4}; + constexpr int kFlatSize = 4 * 4 * 4 * 4; + + std::vector lhs_data(kFlatSize); + std::iota(lhs_data.begin(), lhs_data.end(), 0); + + std::vector rhs_data(kFlatSize); + std::iota(rhs_data.begin(), rhs_data.end(), 0); + + IntegerMulOpModel m( + {TensorType_INT32, test_shape}, {TensorType_INT32, test_shape}, + {TensorType_INT32, {}}, ActivationFunctionType_RELU6, lhs_data, rhs_data, + constant_tensors); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + + const std::vector output = m.GetOutput(); + ASSERT_EQ(output.size(), kFlatSize); + for (int i = 0; i < kFlatSize; ++i) { + EXPECT_EQ(output[i], std::min(i * i, 6)); + } +} + TEST_P(MulOpTest, Int32WithBroadcast) { bool constant_tensors = GetParam(); if (SingleOpModel::GetForceUseNnapi() && constant_tensors) { diff --git a/tensorflow/lite/kernels/register.h b/tensorflow/lite/kernels/register.h index 6721dc69a328bd..e444accec6511e 100644 --- a/tensorflow/lite/kernels/register.h +++ b/tensorflow/lite/kernels/register.h @@ -15,7 +15,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_REGISTER_H_ #define TENSORFLOW_LITE_KERNELS_REGISTER_H_ -#include "tensorflow/lite/core/kernels/register.h" +/// For documentation, see third_party/tensorflow/lite/core/kernels/register.h + +#include "tensorflow/lite/core/kernels/register.h" // IWYU pragma: export namespace tflite { namespace ops { diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc index 7db39695c13de6..7cf5f3710de2c5 100644 --- a/tensorflow/lite/kernels/register_ref.cc +++ b/tensorflow/lite/kernels/register_ref.cc @@ -194,6 +194,7 @@ TfLiteRegistration* Register_STABLEHLO_MULTIPLY(); TfLiteRegistration* Register_STABLEHLO_REDUCE_WINDOW(); TfLiteRegistration* Register_STABLEHLO_MAXIMUM(); TfLiteRegistration* Register_STABLEHLO_MINIMUM(); +TfLiteRegistration* Register_STABLEHLO_PAD(); namespace { @@ -491,7 +492,7 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() { AddBuiltin(BuiltinOperator_ADD_N, Register_ADD_N()); AddBuiltin(BuiltinOperator_GATHER_ND, Register_GATHER_ND(), /* min_version = */ 1, - /* max_version = */ 4); + /* max_version = */ 5); AddBuiltin(BuiltinOperator_WHERE, Register_WHERE(), /* min_version = */ 1, /* max_version = */ 2); AddBuiltin(BuiltinOperator_REVERSE_SEQUENCE, Register_REVERSE_SEQUENCE()); @@ -558,6 +559,7 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() { AddBuiltin(BuiltinOperator_STABLEHLO_REDUCE_WINDOW, Register_STABLEHLO_REDUCE_WINDOW()); AddBuiltin(BuiltinOperator_STABLEHLO_GATHER, Register_STABLEHLO_GATHER()); + AddBuiltin(BuiltinOperator_STABLEHLO_PAD, Register_STABLEHLO_PAD()); AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY_REF()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that diff --git a/tensorflow/lite/kernels/split.cc b/tensorflow/lite/kernels/split.cc index 1491f4bbb98823..83add14be0173e 100644 --- a/tensorflow/lite/kernels/split.cc +++ b/tensorflow/lite/kernels/split.cc @@ -87,7 +87,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 || input_type == kTfLiteInt8 || input_type == kTfLiteInt16 || - input_type == kTfLiteInt32); + input_type == kTfLiteInt32 || input_type == kTfLiteInt64); for (int i = 0; i < NumOutputs(node); ++i) { TfLiteTensor* tensor; TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, i, &tensor)); @@ -158,6 +158,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_SPLIT(int32_t); break; } + case kTfLiteInt64: { + TF_LITE_SPLIT(int64_t); + break; + } default: TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.", TfLiteTypeGetName(op_context.input->type)); diff --git a/tensorflow/lite/kernels/stablehlo_pad.cc b/tensorflow/lite/kernels/stablehlo_pad.cc new file mode 100644 index 00000000000000..13f6b74eae8906 --- /dev/null +++ b/tensorflow/lite/kernels/stablehlo_pad.cc @@ -0,0 +1,291 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + // +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/core/c/builtin_op_data.h" +#include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/util.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace stablehlo_pad { +namespace { + +static constexpr int kMaxDims = 6; + +// Fills a buffer with the given data. +// +// WARNING: This expects buffer_bytes to be a multiple of data_bytes. +void FillBuffer(char* buffer, int64_t buffer_bytes, const char* data, + int64_t data_bytes) { + if (buffer_bytes == 0) { + return; + } + assert(buffer_bytes % data_bytes == 0); + std::memcpy(buffer, data, data_bytes); + buffer_bytes -= data_bytes; + while (buffer_bytes) { + const int64_t bytes = std::min(buffer_bytes, data_bytes); + std::memcpy(buffer + data_bytes, buffer, bytes); + buffer_bytes -= bytes; + data_bytes += bytes; + } +} + +// Recursive implementation of a strided copy of a tensor. +void StridedCopy(const int rank, const char* input, const int64_t* input_shape, + const int64_t* input_strides, char* output, + const int64_t* output_strides, const int64_t element_size, + const int depth) { + if (depth + 1 == rank) { + for (int64_t i = 0; i < input_shape[depth]; ++i) { + std::memcpy(output, input, element_size); + input += input_strides[depth]; + output += output_strides[depth]; + } + } else { + for (int64_t i = 0; i < input_shape[depth]; ++i) { + StridedCopy(rank, input, input_shape, input_strides, output, + output_strides, element_size, depth + 1); + input += input_strides[depth]; + output += output_strides[depth]; + } + } +} + +// Holds the main implementation of the Pad operation. +// +// The StableHLO pad operation can add interior padding and edge padding to a +// tensor. The edge padding may be negative in which case it is considered as a +// cropping specification. +// +// This is implemented as a strided copy where: +// +// - interior padding affects the output strides. +// - positive edge padding affects the output shape, strides and initial offset. +// - negative edge padding affects the input shape and initial offset as well as +// the output initial offset. +// +// See https://github.com/openxla/stablehlo/blob/main/docs/spec.md#pad for more +// information. +class PadData { + public: + enum { kInput, kPaddingValue, kInputTensorCount }; + enum { kOutput, kOutputTensorCount }; + + explicit PadData(const TfLiteStablehloPadParams& params) { + std::memcpy( + edge_pad_low_, params.edge_padding_low, + TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT * sizeof(int64_t)); + std::memcpy( + edge_pad_high_, params.edge_padding_high, + TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT * sizeof(int64_t)); + std::memcpy( + interior_pad_, params.interior_padding, + TFLITE_STABLEHLO_PAD_PARAMS_MAX_DIMENSION_COUNT * sizeof(int64_t)); + } + + // Computes the shapes and strides that are needed for the final strided copy. + void Setup(const int* dims, const int rank, const int64_t element_size) { + rank_ = rank; + element_size_ = element_size; + input_offset_ = 0; + output_offset_ = 0; + output_size_ = 0; + + // Compute the output shape. 
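    // Worked example (illustrative values) of the shape computation below and
    // of the stablehlo.pad semantics this class implements:
    //
    //   input dims = {3} -> [a, b, c], interior_pad = {1}, edge_pad_low = {1},
    //   edge_pad_high = {-1}, padding value p.
    //
    //   output dim = (3 - 1) * (1 + 1) + 1 + 1 + (-1) = 5
    //
    //   dilate:    [a, p, b, p, c]     (interior padding inserts p between elements)
    //   low pad:   [p, a, p, b, p, c]  (one leading p)
    //   high crop: [p, a, p, b, p]     (negative high padding drops one element)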
+ for (int i = 0; i < rank; ++i) { + output_shape_[i] = (dims[i] - 1) * (interior_pad_[i] + 1) + 1 + + edge_pad_low_[i] + edge_pad_high_[i]; + } + if (std::any_of(output_shape_, output_shape_ + rank, + [](auto s) { return s <= 0; })) { + std::memset(input_shape_, 0, sizeof(input_shape_)); + std::memset(output_shape_, 0, sizeof(output_shape_)); + output_size_ = 0; + return; + } + // Compute the output size for each dimension. + // + // This is different from the output strides because of the interior + // padding: the output strides take it into account to "jump" over the + // interior padding elements. + output_dimension_sizes_[rank - 1] = element_size; + for (int i = rank - 2; i >= 0; --i) { + output_dimension_sizes_[i] = + output_shape_[i + 1] * output_dimension_sizes_[i + 1]; + } + // Compute the output stride for each dimension. + // + // This is the stride between two elements that are copied from the input + // tensor (i.e. not generated by interior padding). + output_strides_[rank - 1] = element_size * (interior_pad_[rank - 1] + 1); + for (int i = rank - 2; i >= 0; --i) { + output_strides_[i] = output_dimension_sizes_[i] * (interior_pad_[i] + 1); + } + // Compute the output offset from the eventual pads. + for (int i = 0; i < rank; ++i) { + output_offset_ += + std::max(edge_pad_low_[i], 0) * output_dimension_sizes_[i]; + } + // Compute the final output size. + output_size_ = std::accumulate(output_shape_, output_shape_ + rank, + element_size, std::multiplies<>()); + // Compute input strides. + input_strides_[rank - 1] = element_size; + for (int i = rank - 1; i >= 1; --i) { + input_strides_[i - 1] = dims[i] * input_strides_[i]; + } + // Helper that computes the division between a negative num and a positive + // denum, rounding away from 0, or returns 0 if num is positive. + auto DivNegRoundAwayOrZero = [](int64_t num, int64_t denum) -> int64_t { + assert(denum > 0); + return num < 0 ? (num - denum + 1) / denum : 0; + }; + // Compute the input bounds from the eventual crops. + // + // If negative padding is applied, we can treat this as copying a subtensor + // of the input. We modify the input shape in place as we don't use it for + // anything else. + for (int i = 0; i < rank; ++i) { + input_shape_[i] = + dims[i] + + DivNegRoundAwayOrZero(edge_pad_low_[i], interior_pad_[i] + 1) + + DivNegRoundAwayOrZero(edge_pad_high_[i], interior_pad_[i] + 1); + } + // Compute the input offset from the eventual crops. + // + // When computing the subtensor from the negative padding, we need to find + // out the offset to its first element in addition to its shape (see + // previous comment). + // + // Cropping also means that the interior padding can become edge padding so + // we also need to update the output offset: + // + // > `1 0 0 0 2 0 0 0 3` cropped by 1 low element becomes `0 0 0 2 0 0 0 3` + // > which effectlvely means pad `2 3` with an interior padding of 3 and a + // > low edge padding of 3. + for (int i = 0; i < rank; ++i) { + input_offset_ -= + DivNegRoundAwayOrZero(edge_pad_low_[i], interior_pad_[i] + 1) * + input_strides_[i]; + if (edge_pad_low_[i] < 0) { + int64_t tmp_offset = ((interior_pad_[i] + 1 + edge_pad_low_[i]) % + (interior_pad_[i] + 1)); + if (tmp_offset < 0) { + tmp_offset += interior_pad_[i] + 1; + } + output_offset_ += tmp_offset * output_dimension_sizes_[i]; + } + } + } + + void Apply(const char* input, const char* padding_value, char* output) const { + // Fill the output tensor with the padding value. 
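    // Note on the FillBuffer call below (illustrative sizes): the helper
    // doubles the initialized prefix on every iteration, so initializing the
    // whole output from a single element takes O(log2(output_size /
    // element_size)) memcpy calls. For a 4-byte padding value and a 32-byte
    // output it copies 4 bytes, then 4, 8 and 16 bytes.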
+ FillBuffer(output, output_size_, padding_value, element_size_); + StridedCopy(rank_, input + input_offset_, input_shape_, input_strides_, + output + output_offset_, output_strides_, element_size_, + /*depth=*/0); + } + + TfLiteIntArray* BuildOuputTensorDims() const { + TfLiteIntArray* dims = TfLiteIntArrayCreate(rank_); + for (int64_t i = 0; i < rank_; ++i) { + dims->data[i] = output_shape_[i]; + } + return dims; + } + + private: + int64_t edge_pad_low_[kMaxDims]; + int64_t edge_pad_high_[kMaxDims]; + int64_t interior_pad_[kMaxDims]; + int64_t rank_ = 0; + int64_t element_size_ = 0; + int64_t input_shape_[kMaxDims]; + int64_t output_shape_[kMaxDims]; + int64_t input_strides_[kMaxDims]; + int64_t output_strides_[kMaxDims]; + int64_t output_dimension_sizes_[kMaxDims]; + int64_t input_offset_ = 0; + int64_t output_offset_ = 0; + int64_t output_size_ = 0; +}; + +void* Init(TfLiteContext* context, const char* options, size_t options_len) { + return new PadData( + *reinterpret_cast(options)); +} + +void Free(TfLiteContext* context, void* node_data) { + delete reinterpret_cast(node_data); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + // Input checks. + const TfLiteTensor* input_tensor = GetInput(context, node, PadData::kInput); + const TfLiteTensor* padding_value_tensor = + GetInput(context, node, PadData::kPaddingValue); + TF_LITE_ENSURE(context, input_tensor->type == padding_value_tensor->type); + // PadData computations. + size_t element_size; + TF_LITE_ENSURE(context, GetSizeOfType(context, input_tensor->type, + &element_size) == kTfLiteOk); + PadData& pad_data = *reinterpret_cast(node->user_data); + pad_data.Setup(input_tensor->dims->data, input_tensor->dims->size, + element_size); + // Output tensor setup. + TfLiteTensor* output_tensor = GetOutput(context, node, PadData::kOutput); + TF_LITE_ENSURE(context, input_tensor->type == output_tensor->type); + context->ResizeTensor(context, output_tensor, + pad_data.BuildOuputTensorDims()); + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input_tensor = GetInput(context, node, PadData::kInput); + const TfLiteTensor* padding_value_tensor = + GetInput(context, node, PadData::kPaddingValue); + TfLiteTensor* output_tensor = GetOutput(context, node, PadData::kOutput); + // Pad using PadData + PadData& pad_data = *reinterpret_cast(node->user_data); + pad_data.Apply(input_tensor->data.raw_const, + padding_value_tensor->data.raw_const, output_tensor->data.raw); + return kTfLiteOk; +} + +} // namespace +} // namespace stablehlo_pad + +TfLiteRegistration* Register_STABLEHLO_PAD() { + static TfLiteRegistration r = {/*.init=*/stablehlo_pad::Init, + /*.free=*/stablehlo_pad::Free, + /*.prepare=*/stablehlo_pad::Prepare, + /*.invoke=*/stablehlo_pad::Eval}; + return &r; +} +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/kernels/stablehlo_pad_test.cc b/tensorflow/lite/kernels/stablehlo_pad_test.cc new file mode 100644 index 00000000000000..f7a3aede43d40e --- /dev/null +++ b/tensorflow/lite/kernels/stablehlo_pad_test.cc @@ -0,0 +1,471 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + // +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// #include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "absl/algorithm/container.h" +#include "absl/random/bit_gen_ref.h" +#include "absl/random/random.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_format.h" +#include "absl/types/span.h" +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/stablehlo_reduce_window_test_util.h" +#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace stablehlo_pad { +namespace { + +using testing::ElementsAre; +using testing::ElementsAreArray; +using testing::HasSubstr; + +template +class StablehloPadModel : public SingleOpModel { + public: + static constexpr TensorType kTensorType = GetTensorType(); + + void SetEdgePadding(std::vector low, std::vector high) { + edge_padding_low_ = std::move(low); + edge_padding_high_ = std::move(high); + } + + const std::vector& GetEdgePaddingLow() const { + return edge_padding_low_; + } + + const std::vector& GetEdgePaddingHigh() const { + return edge_padding_high_; + } + + void SetInteriorPadding(std::vector padding) { + interior_padding_ = std::move(padding); + } + + const std::vector& GetInteriorPadding() const { + return interior_padding_; + } + + void SetInput(std::vector shape) { + input_.shape = shape; + input_.data.resize(absl::c_accumulate(shape, 1, std::multiplies<>())); + absl::c_iota(input_.data, static_cast(1)); + } + + void SetInput(std::vector shape, std::vector data) { + input_.shape = shape; + input_.data = data; + } + + void SetInput(absl::Span shape, absl::BitGenRef bitgen, T min, + T max) { + input_.shape.assign(shape.begin(), shape.end()); + input_.data.resize(absl::c_accumulate(shape, 1, std::multiplies<>())); + absl::c_generate(input_.data, [&] { + return absl::Uniform(absl::IntervalClosed, bitgen, min, max); + }); + } + + const reduce_window::reference::Tensor& GetInput() const { return input_; } + + void SetPaddingValue(const T& v) { padding_value_ = v; } + + T GetPaddingValue() const { return padding_value_; } + + absl::Span GetOutputData() { + return absl::Span(interpreter_->typed_tensor(output_tensor_id_), + GetTensorSize(output_tensor_id_)); + } + + absl::Span GetOutputShape() { + const TfLiteIntArray& shape = + *(interpreter_->tensor(output_tensor_id_)->dims); + return absl::Span(shape.data, shape.size); + } + + absl::Status CheckPreconditions() { + const size_t rank = input_.shape.size(); + if (rank == 0) { + return absl::FailedPreconditionError("Input rank is 0."); + } + if (edge_padding_low_.empty()) { + edge_padding_low_ = std::vector(rank, 0); + } else if (edge_padding_low_.size() != rank) { + return absl::FailedPreconditionError( + "Low edge padding does not have the right size."); + } + if (edge_padding_high_.empty()) { + edge_padding_high_ = std::vector(rank, 0); + } else if (edge_padding_high_.size() != rank) { + return 
absl::FailedPreconditionError( + "High edge padding does not have the right size."); + } + if (interior_padding_.empty()) { + interior_padding_ = std::vector(rank, 0); + } else if (interior_padding_.size() != rank) { + return absl::FailedPreconditionError( + "Interior padding does not have the right size."); + } + return absl::OkStatus(); + } + + absl::Status Build() { + if (absl::Status status = CheckPreconditions(); !status.ok()) { + return status; + } + input_tensor_id_ = + AddInput({kTensorType, + std::vector(input_.shape.begin(), input_.shape.end())}); + padding_value_tensor_id_ = + AddConstInput(kTensorType, /*data=*/{padding_value_}, /*shape=*/{1}); + output_tensor_id_ = AddOutput(kTensorType); + + SetBuiltinOp(BuiltinOperator_STABLEHLO_PAD, + BuiltinOptions2_StablehloPadOptions, + CreateStablehloPadOptions( + builder_, builder_.CreateVector(edge_padding_low_), + builder_.CreateVector(edge_padding_high_), + builder_.CreateVector(interior_padding_)) + .Union()); + BuildInterpreter( + /*input_shapes=*/{std::vector(input_.shape.begin(), + input_.shape.end())}, + /*num_threads=*/-1, /*allow_fp32_relax_to_fp16=*/false, + /*apply_delegate=*/true, /*allocate_and_delegate=*/false, + /*use_simple_allocator=*/false); + AllocateAndDelegate(/*apply_delegate=*/true); + PopulateTensor(input_tensor_id_, input_.data); + return absl::OkStatus(); + } + + absl::Status BuildAndInvoke() { + if (absl::Status status = Build(); !status.ok()) { + return status; + } + if (TfLiteStatus status = Invoke(); status != kTfLiteOk) { + const std::string msg = + absl::StrFormat("Invoke failed with status %d.", status); + return absl::InternalError(msg); + } + return absl::OkStatus(); + } + + friend std::ostream& operator<<(std::ostream& os, + const StablehloPadModel& model) { + auto print_vec = [&os](const auto& vec) { + os << "["; + if (!vec.empty()) { + auto it = vec.begin(); + os << +*(it++); + for (; it != vec.end(); ++it) { + os << ", " << +*it; + } + } + os << "]"; + }; + os << " edge_padding_low: "; + print_vec(model.GetEdgePaddingLow()); + os << "\n edge_padding_high: "; + print_vec(model.GetEdgePaddingHigh()); + os << "\n interior_padding: "; + print_vec(model.GetInteriorPadding()); + os << "\n padding_value: " << +model.GetPaddingValue(); + os << "\n input shape: "; + print_vec(model.GetInput().shape); + return os; + } + + private: + std::vector edge_padding_low_; + std::vector edge_padding_high_; + std::vector interior_padding_; + reduce_window::reference::Tensor input_; + T padding_value_ = 0; + + int input_tensor_id_; + int padding_value_tensor_id_; + int output_tensor_id_; +}; + +template +absl::StatusOr> ComputeReference( + StablehloPadModel& model) { + if (absl::Status status = model.CheckPreconditions(); !status.ok()) { + return status; + } + std::vector dilations, padding; + for (size_t i = 0; i < model.GetInput().shape.size(); ++i) { + padding.push_back(model.GetEdgePaddingLow()[i]); + padding.push_back(model.GetEdgePaddingHigh()[i]); + dilations.push_back(model.GetInteriorPadding()[i] + 1); + } + + auto dilated_tensor = reduce_window::reference::Dilate( + model.GetInput(), dilations, model.GetPaddingValue()); + auto padded_tensor = reduce_window::reference::Pad(dilated_tensor, padding, + model.GetPaddingValue()); + return reduce_window::reference::Crop(padded_tensor, padding); +} + +TEST(StablehloPadModelTest, DefaultModelFails) { + StablehloPadModel model; + const auto expected_status = ComputeReference(model); + EXPECT_FALSE(expected_status.ok()); + EXPECT_EQ(expected_status.status().code(), + 
absl::StatusCode::kFailedPrecondition); + EXPECT_THAT(expected_status.status().message(), + HasSubstr("Input rank is 0.")); +} + +TEST(StablehloPadModelTest, DefaultModelReturnsIdentity) { + StablehloPadModel model; + model.SetInput({3, 1}); + EXPECT_THAT(model.GetInput().shape, ElementsAre(3, 1)); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + EXPECT_THAT(expected_status.value().data, + ElementsAreArray(model.GetInput().data)); +} + +TEST(StablehloPadModelTest, WrongEdgePaddingSizeIsAnError) { + StablehloPadModel model; + model.SetInput({3, 1}); + model.SetEdgePadding(/*low=*/{3, 4, 5}, /*high=*/{6, 7}); + { + const auto expected_status = ComputeReference(model); + EXPECT_FALSE(expected_status.ok()); + EXPECT_EQ(expected_status.status().code(), + absl::StatusCode::kFailedPrecondition); + EXPECT_THAT(expected_status.status().message(), + HasSubstr("Low edge padding does not have the right size.")); + } + model.SetEdgePadding(/*low=*/{3, 4}, /*high=*/{5, 6, 7}); + { + const auto expected_status = ComputeReference(model); + EXPECT_FALSE(expected_status.ok()); + EXPECT_EQ(expected_status.status().code(), + absl::StatusCode::kFailedPrecondition); + EXPECT_THAT(expected_status.status().message(), + HasSubstr("High edge padding does not have the right size.")); + } +} + +TEST(StablehloPadModelTest, WrongInteriorPaddingSizeIsAnError) { + StablehloPadModel model; + model.SetInput({3, 1}); + model.SetInteriorPadding({3, 4, 5}); + const auto expected_status = ComputeReference(model); + EXPECT_FALSE(expected_status.ok()); + EXPECT_EQ(expected_status.status().code(), + absl::StatusCode::kFailedPrecondition); + EXPECT_THAT(expected_status.status().message(), + HasSubstr("Interior padding does not have the right size.")); +} + +TEST(StablehloPadTest, IdentityParams) { + StablehloPadModel model; + model.SetInput({3, 3}); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(model.GetInput().shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(model.GetInput().data)); +} + +TEST(StablehloPadTest, InteriorPad) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetInteriorPadding({1, 2}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, LowPad) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({1, 1}, {0, 0}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, HighPad) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({0, 0}, {1, 1}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, AllPad) { + StablehloPadModel model; + 
model.SetInput({3, 3}); + model.SetEdgePadding({1, 1}, {1, 1}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, LowCrop) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({-1, -1}, {0, 0}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, HighCrop) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({0, 0}, {-1, -1}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, AllCrop) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({-1, -1}, {-1, -1}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, PadCrop) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({1, -1}, {1, -1}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, InteriorEdgePadding) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({-1, -4}, {0, 0}); + model.SetInteriorPadding({1, 2}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +TEST(StablehloPadTest, CallPrepareTwiceDoesNotFail) { + StablehloPadModel model; + model.SetInput({3, 3}); + model.SetEdgePadding({-1, -4}, {0, 0}); + model.SetInteriorPadding({1, 2}); + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + // Applying delegates forces Prepare to be called twice. + model.SetApplyDefaultDelegates(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)); + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)); +} + +// Returns a vector of given size with elements in the range [min, max]. 
+template +std::vector RandomVector(absl::BitGen& bitgen, size_t size, T min, T max) { + std::vector vec(size); + for (T& v : vec) { + v = absl::Uniform(absl::IntervalClosed, bitgen, min, max); + } + return vec; +} + +template +class StablehloPadFuzzyTest : public testing::Test {}; + +using TestList = + testing::Types; +TYPED_TEST_SUITE(StablehloPadFuzzyTest, TestList); + +TYPED_TEST(StablehloPadFuzzyTest, FuzzyTest) { + absl::BitGen bitgen; + + for (size_t iteration = 0; iteration < 10000; ++iteration) { + const int rank = absl::Uniform(absl::IntervalClosed, bitgen, 1, 2); + + StablehloPadModel model; + model.SetInput( + /*shape=*/RandomVector(bitgen, rank, /*min=*/1, /*max=*/3), + bitgen, /*min=*/-5, /*max=*/5); + model.SetInteriorPadding( + RandomVector(bitgen, rank, /*min=*/0, /*max=*/2)); + model.SetEdgePadding( + RandomVector(bitgen, rank, /*min=*/-5, /*max=*/5), + RandomVector(bitgen, rank, /*min=*/-5, /*max=*/5)); + model.SetPaddingValue( + absl::Uniform(absl::IntervalClosed, bitgen, -127, 127)); + + const auto expected_status = ComputeReference(model); + ASSERT_TRUE(expected_status.ok()); + const auto& expected = expected_status.value(); + ASSERT_TRUE(model.BuildAndInvoke().ok()); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray(expected.shape)) + << model; + EXPECT_THAT(model.GetOutputData(), ElementsAreArray(expected.data)) + << model; + } +} + +} // namespace +} // namespace stablehlo_pad +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/kernels/stablehlo_reduce_window.cc b/tensorflow/lite/kernels/stablehlo_reduce_window.cc index 32bf358239bc85..78385506c8d768 100644 --- a/tensorflow/lite/kernels/stablehlo_reduce_window.cc +++ b/tensorflow/lite/kernels/stablehlo_reduce_window.cc @@ -202,7 +202,7 @@ void Dilate(const DilateData& ctx, const char* input, const char* init_value, // Fill the output tensor with the padding value. { std::memcpy(output, init_value, ctx.init_element_size); - int64_t remaining_bytes = ctx.output_size; + int64_t remaining_bytes = ctx.output_size - ctx.init_element_size; int64_t copied_bytes = ctx.init_element_size; while (remaining_bytes) { int64_t bytes = std::min(remaining_bytes, copied_bytes); diff --git a/tensorflow/lite/kernels/stablehlo_reduce_window_test.cc b/tensorflow/lite/kernels/stablehlo_reduce_window_test.cc index a26c286ea350e9..fa95ac51075738 100644 --- a/tensorflow/lite/kernels/stablehlo_reduce_window_test.cc +++ b/tensorflow/lite/kernels/stablehlo_reduce_window_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -688,10 +689,17 @@ std::vector RandomVector(absl::BitGen& bitgen, size_t size, T min, T max) { } struct Body { - static Body GetRandomSupported(absl::BitGen& bitgen) { - return Body{/*.body=*/static_cast(absl::Uniform( + static Body GetRandomSupported(absl::BitGen& bitgen, bool allow_mul) { + Body b; + b = Body{/*.body=*/static_cast(absl::Uniform( absl::IntervalClosed, bitgen, static_cast(BodyFunction::kAdd), static_cast(BodyFunction::kAny)))}; + // This skews the uniformity of the random generation in favor of add. We + // only need to ensure that all the cases are tested. 
+ if (!allow_mul && b.func == BodyFunction::kMul) { + b.func = BodyFunction::kAdd; + } + return b; } template @@ -746,7 +754,9 @@ TYPED_TEST(StablehloReduceWindowTest, FuzzyTest) { const int rank = absl::Uniform(absl::IntervalClosed, bitgen, 1, 3); ReduceWindowOpModel model; - Body body = Body::GetRandomSupported(bitgen); + // To avoid reduction overflows, we only test mul with floating point types. + Body body = Body::GetRandomSupported( + bitgen, /*allow_mul=*/std::is_floating_point::value); model.SetInput( /*shape=*/RandomVector(bitgen, rank, /*min=*/1, /*max=*/10), bitgen, /*min=*/-5, /*max=*/5); diff --git a/tensorflow/lite/kernels/stablehlo_reduce_window_test_util.h b/tensorflow/lite/kernels/stablehlo_reduce_window_test_util.h index c514587a394014..d5cd7cc640e4de 100644 --- a/tensorflow/lite/kernels/stablehlo_reduce_window_test_util.h +++ b/tensorflow/lite/kernels/stablehlo_reduce_window_test_util.h @@ -91,6 +91,9 @@ inline std::vector DilateShape(std::vector shape, for (size_t i = 0; i < shape.size(); ++i) { shape[i] = (shape[i] - 1) * dilations[i] + 1; } + if (absl::c_any_of(shape, [](auto s) { return s <= 0; })) { + absl::c_fill(shape, 0); + } return shape; } @@ -100,6 +103,10 @@ Tensor Dilate(const Tensor& input, const std::vector& dilations, Tensor output = Tensor::FromShape(DilateShape(input.shape, dilations), padding_value); + if (absl::c_all_of(output.shape, [](auto s) { return s == 0; })) { + return output; + } + const std::vector strides = input.Strides(); const std::vector output_strides = output.Strides(); const std::vector safe_dilations = ExtendToMaxDim(dilations); @@ -142,6 +149,9 @@ inline std::vector PadCropShape(std::vector shape, for (size_t i = 0; i < shape.size(); ++i) { shape[i] = shape[i] + padding[2 * i] + padding[2 * i + 1]; } + if (absl::c_any_of(shape, [](auto s) { return s <= 0; })) { + absl::c_fill(shape, 0); + } return shape; } @@ -160,6 +170,10 @@ Tensor Pad(const Tensor& input, const std::vector& padding, Tensor output = Tensor::FromShape( PadCropShape(input.shape, safe_padding), padding_value); + if (absl::c_all_of(output.shape, [](auto s) { return s == 0; })) { + return output; + } + const std::vector strides = input.Strides(); const std::vector output_strides = output.Strides(); const std::vector safe_input_shape = ExtendToMaxDim(input.shape); @@ -209,6 +223,10 @@ Tensor Crop(const Tensor& input, const std::vector& cropping) { Tensor output = Tensor::FromShape(PadCropShape(input.shape, safe_cropping)); + if (absl::c_all_of(output.shape, [](auto s) { return s == 0; })) { + return output; + } + const std::vector strides = input.Strides(); const std::vector output_strides = output.Strides(); const std::vector safe_output_shape = ExtendToMaxDim(output.shape); diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index 78d4fe18e39acb..4781eae5dab108 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -1100,24 +1100,38 @@ class SingleOpTest : public ::testing::TestWithParam { } }; +// Maps the native C++ types to the corresponding TFLite tensor type enum +// values. 
+template +struct TensorTypeFor; + +#define TFLITE_TENSOR_TYPE_ASSOC(CPP_TYPE, TENSORTYPE_VALUE) \ + template <> \ + struct TensorTypeFor { \ + static constexpr TensorType value = TENSORTYPE_VALUE; \ + }; + +TFLITE_TENSOR_TYPE_ASSOC(bool, TensorType_BOOL); +TFLITE_TENSOR_TYPE_ASSOC(int8_t, TensorType_INT8); +TFLITE_TENSOR_TYPE_ASSOC(int16_t, TensorType_INT16); +TFLITE_TENSOR_TYPE_ASSOC(int32_t, TensorType_INT32); +TFLITE_TENSOR_TYPE_ASSOC(int64_t, TensorType_INT64); +TFLITE_TENSOR_TYPE_ASSOC(uint8_t, TensorType_UINT8); +TFLITE_TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16); +TFLITE_TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32); +TFLITE_TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64); +TFLITE_TENSOR_TYPE_ASSOC(TfLiteFloat16, TensorType_FLOAT16); +TFLITE_TENSOR_TYPE_ASSOC(Eigen::half, TensorType_FLOAT16); +TFLITE_TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32); +TFLITE_TENSOR_TYPE_ASSOC(double, TensorType_FLOAT64); +TFLITE_TENSOR_TYPE_ASSOC(std::string, TensorType_STRING); + +#undef TFLITE_TENSOR_TYPE_ASSOC + // Returns the corresponding TensorType given the type T. template -TensorType GetTensorType() { - if (std::is_same::value) return TensorType_FLOAT32; - if (std::is_same::value) return TensorType_FLOAT16; - if (std::is_same::value) return TensorType_FLOAT16; - if (std::is_same::value) return TensorType_FLOAT64; - if (std::is_same::value) return TensorType_INT8; - if (std::is_same::value) return TensorType_INT16; - if (std::is_same::value) return TensorType_UINT16; - if (std::is_same::value) return TensorType_INT32; - if (std::is_same::value) return TensorType_UINT32; - if (std::is_same::value) return TensorType_INT64; - if (std::is_same::value) return TensorType_UINT64; - if (std::is_same::value) return TensorType_UINT8; - if (std::is_same::value) return TensorType_STRING; - if (std::is_same::value) return TensorType_BOOL; - return TensorType_MIN; // default value +constexpr TensorType GetTensorType() { + return TensorTypeFor::value; } // Strings have a special implementation that is in test_util.cc diff --git a/tensorflow/lite/kernels/variants/BUILD b/tensorflow/lite/kernels/variants/BUILD index 13cb9d1567b297..3a806135fa7a50 100644 --- a/tensorflow/lite/kernels/variants/BUILD +++ b/tensorflow/lite/kernels/variants/BUILD @@ -179,6 +179,24 @@ cc_test( ], ) +cc_test( + name = "list_push_back_test", + srcs = ["list_kernels/list_push_back_test.cc"], + deps = [ + ":list_ops_lib", + ":tensor_array", + ":test_util", + "//tensorflow/lite:type_to_tflitetype", + "//tensorflow/lite/core/c:c_api_types", + "//tensorflow/lite/core/c:common", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/kernels:test_util", + "//tensorflow/lite/kernels/internal:compatibility", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_googletest//:gtest_main", + ], +) + cc_test( name = "variant_add_n_test", srcs = ["list_kernels/variant_add_n_test.cc"], @@ -198,13 +216,12 @@ cc_test( ) cc_test( - name = "list_push_back_test", - srcs = ["list_kernels/list_push_back_test.cc"], + name = "variant_zeros_like_test", + srcs = ["list_kernels/variant_zeros_like_test.cc"], deps = [ ":list_ops_lib", ":tensor_array", ":test_util", - "//tensorflow/lite:type_to_tflitetype", "//tensorflow/lite/core/c:c_api_types", "//tensorflow/lite/core/c:common", "//tensorflow/lite/kernels:kernel_util", diff --git a/tensorflow/lite/kernels/variants/list_kernels/list_reserve.cc b/tensorflow/lite/kernels/variants/list_kernels/list_reserve.cc index 094bf38104caa6..7637a4064a5451 100644 --- 
a/tensorflow/lite/kernels/variants/list_kernels/list_reserve.cc +++ b/tensorflow/lite/kernels/variants/list_kernels/list_reserve.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include +#include "tensorflow/lite/array.h" #include "tensorflow/lite/core/c/c_api_types.h" #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -21,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/kernels/variants/list_ops_util.h" #include "tensorflow/lite/kernels/variants/tensor_array.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/util.h" namespace tflite { namespace variants { @@ -46,23 +49,132 @@ TfLiteType ConvertTensorType(TensorType src) { } } -constexpr int kElementShapeInput = 0; -constexpr int kNumElementsInput = 1; constexpr int kListOut = 0; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); +struct SemanticOutType { + TfLiteType element_type; + IntArrayUniquePtr element_shape; + int num_elements; +}; + +class ReserveSemantic { + public: + ReserveSemantic(TfLiteContext* context, TfLiteNode* node) + : context_(context), node_(node) {} + + constexpr static int kElementShapeInput = 0; + constexpr static int kNumElementsInput = 1; + + TfLiteStatus CheckInputs() const { + TF_LITE_ENSURE_EQ(context_, NumInputs(node_), 2); + const TfLiteTensor* element_shape; + TF_LITE_ENSURE_OK( + context_, + GetInputSafe(context_, node_, kElementShapeInput, &element_shape)); + TF_LITE_ENSURE(context_, element_shape->type == kTfLiteInt32); + const TfLiteTensor* num_elements; + TF_LITE_ENSURE_OK(context_, GetInputSafe(context_, node_, kNumElementsInput, + &num_elements)); + TF_LITE_ENSURE_TYPES_EQ(context_, num_elements->type, kTfLiteInt32); + return kTfLiteOk; + } + + TfLiteStatus Compute(SemanticOutType& result) const { + // Parse element type from custom options. + auto* options = + reinterpret_cast(node_->custom_initial_data); + TfLiteType element_type = ConvertTensorType(options->element_type); + TF_LITE_ENSURE(context_, element_type != kTfLiteNoType); + + const TfLiteTensor* num_elements; + TF_LITE_ENSURE_OK(context_, GetInputSafe(context_, node_, kNumElementsInput, + &num_elements)); + TF_LITE_ENSURE_TYPES_EQ(context_, num_elements->type, kTfLiteInt32); + TF_LITE_ENSURE_EQ(context_, num_elements->dims->size, 0); + const int num_elements_value = num_elements->data.i32[0]; + TF_LITE_ENSURE(context_, num_elements_value >= 0); + + // Create int array representing constraint on list's constituent elements. 
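+    // (This is the kElementShapeInput tensor whose type was validated in
+    // CheckInputs above.)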
+ const TfLiteTensor* element_shape_tensor; + TF_LITE_ENSURE_OK(context_, + GetInputSafe(context_, node_, kElementShapeInput, + &element_shape_tensor)); + IntArrayUniquePtr element_shape = TensorAsShape(*element_shape_tensor); + + result = SemanticOutType{element_type, std::move(element_shape), + num_elements_value}; + return kTfLiteOk; + } - const TfLiteTensor* element_shape; - TF_LITE_ENSURE_OK( - context, GetInputSafe(context, node, kElementShapeInput, &element_shape)); - TF_LITE_ENSURE(context, element_shape->type == kTfLiteInt32); + TfLiteStatus PopulateOutput(TensorArray* const output) const { + return kTfLiteOk; + } + + private: + TfLiteContext* const context_; + TfLiteNode* const node_; +}; + +class ZerosLikeSemantic { + public: + ZerosLikeSemantic(TfLiteContext* context, TfLiteNode* node) + : context_(context), node_(node) {} + + constexpr static int kListInput = 0; + + TfLiteStatus CheckInputs() const { + TF_LITE_ENSURE_EQ(context_, NumInputs(node_), 1); + const TfLiteTensor* list_input; + TF_LITE_ENSURE_OK(context_, + GetInputSafe(context_, node_, kListInput, &list_input)); + TF_LITE_ENSURE(context_, list_input->type == kTfLiteVariant); + return kTfLiteOk; + } - const TfLiteTensor* num_elements; - TF_LITE_ENSURE_OK( - context, GetInputSafe(context, node, kNumElementsInput, &num_elements)); - TF_LITE_ENSURE_TYPES_EQ(context, num_elements->type, kTfLiteInt32); + TfLiteStatus Compute(SemanticOutType& result) const { + const TfLiteTensor* list_input; + TF_LITE_ENSURE_OK(context_, + GetInputSafe(context_, node_, kListInput, &list_input)); + const TensorArray* const input = + reinterpret_cast(list_input->data.data); + + result = SemanticOutType{input->ElementType(), + BuildTfLiteArray(*input->ElementShape()), + input->NumElements()}; + return kTfLiteOk; + } + TfLiteStatus PopulateOutput(TensorArray* const output) const { + const TfLiteTensor* list_input; + TF_LITE_ENSURE_OK(context_, + GetInputSafe(context_, node_, kListInput, &list_input)); + const TensorArray* const input = + reinterpret_cast(list_input->data.data); + for (int i = 0; i < input->NumElements(); ++i) { + const TfLiteTensor* const at = input->At(i); + if (at == nullptr) continue; + // Tensorflow supports lazy allocation in this case which is not possible + // with tflite tensors. If this proves to be a performance bottleneck we + // can investigate storing more info in TensorArray putting off allocation + // for later. + TensorUniquePtr output_at = BuildTfLiteTensor( + at->type, BuildTfLiteArray(*at->dims), kTfLiteDynamic); + memset(output_at->data.data, 0, output_at->bytes); + TF_LITE_ENSURE(context_, output->Set(i, std::move(output_at))); + } + return kTfLiteOk; + } + + private: + TfLiteContext* const context_; + TfLiteNode* const node_; +}; + +template +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const Semantic sem(context, node); + TF_LITE_ENSURE_OK(context, sem.CheckInputs()); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); TfLiteTensor* output; TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, kListOut, &output)); TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteVariant); @@ -70,40 +182,25 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - // Parse element type from custom options. 
- auto* options = - reinterpret_cast(node->custom_initial_data); - TfLiteType element_type = ConvertTensorType(options->element_type); - TF_LITE_ENSURE(context, element_type != kTfLiteNoType); - - const TfLiteTensor* num_elements; - TF_LITE_ENSURE_OK( - context, GetInputSafe(context, node, kNumElementsInput, &num_elements)); - TF_LITE_ENSURE_TYPES_EQ(context, num_elements->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, num_elements->dims->size, 0); - const int num_elements_value = num_elements->data.i32[0]; - TF_LITE_ENSURE(context, num_elements_value >= 0); - - // Create int array representing constraint on list's constituent elements. - const TfLiteTensor* element_shape_tensor; - TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kElementShapeInput, - &element_shape_tensor)); - IntArrayUniquePtr element_shape = TensorAsShape(*element_shape_tensor); + const Semantic sem(context, node); + SemanticOutType data; + TF_LITE_ENSURE_OK(context, sem.Compute(data)); TfLiteTensor* output; TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, kListOut, &output)); // Construct new `TensorArray` underneath the output tensor. - TfLiteStatus stat = - TfLiteTensorVariantRealloc( - output, std::move(element_type), std::move(element_shape)); + TfLiteStatus stat = TfLiteTensorVariantRealloc( + output, data.element_type, std::move(data.element_shape)); TF_LITE_ENSURE_OK(context, stat); // Set size of array. - TensorArray* arr = + TensorArray* const arr = static_cast(static_cast(output->data.data)); - arr->Resize(num_elements_value); + arr->Resize(data.num_elements); + TF_LITE_ENSURE_OK(context, sem.PopulateOutput(arr)); return kTfLiteOk; } @@ -111,8 +208,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace list_reserve TfLiteRegistration* Register_LIST_RESERVE() { - static TfLiteRegistration r = {nullptr, nullptr, list_reserve::Prepare, - list_reserve::Eval}; + static TfLiteRegistration r = { + nullptr, nullptr, list_reserve::Prepare, + list_reserve::Eval}; + return &r; +} + +TfLiteRegistration* Register_VARIANT_ZEROS_LIKE() { + static TfLiteRegistration r = { + nullptr, nullptr, list_reserve::Prepare, + list_reserve::Eval}; return &r; } diff --git a/tensorflow/lite/kernels/variants/list_kernels/variant_zeros_like_test.cc b/tensorflow/lite/kernels/variants/list_kernels/variant_zeros_like_test.cc new file mode 100644 index 00000000000000..54647833550bfa --- /dev/null +++ b/tensorflow/lite/kernels/variants/list_kernels/variant_zeros_like_test.cc @@ -0,0 +1,132 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include +#include +#include "tensorflow/lite/core/c/c_api_types.h" +#include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/kernels/variants/list_kernels/test_util.h" +#include "tensorflow/lite/kernels/variants/list_ops_lib.h" +#include "tensorflow/lite/kernels/variants/tensor_array.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace variants { +namespace ops { +namespace { + +using ::testing::AllOf; +using ::testing::Combine; +using ::testing::ValuesIn; +using ::tflite::variants::TensorArray; + +class VariantZerosLikeModel : public ListOpModel { + public: + explicit VariantZerosLikeModel() { + list_input_ = AddInput({TensorType_VARIANT, {}}); + list_output_ = AddOutput({TensorType_VARIANT, {}}); + SetCustomOp("VariantZerosLike", {}, Register_VARIANT_ZEROS_LIKE); + BuildInterpreter({{}}); + } + + const TensorArray* GetOutputTensorArray() { + TfLiteTensor* tensor = interpreter_->tensor(list_output_); + TFLITE_CHECK(tensor != nullptr && tensor->type == kTfLiteVariant && + tensor->allocation_type == kTfLiteVariantObject); + return static_cast( + static_cast(tensor->data.data)); + } + + int list_input_; + int list_output_; +}; + +using VariantZerosLikeTestParam = std::tuple, TfLiteType, int>; +class VariantZerosLikeTest + : public testing::TestWithParam { + public: + enum { kShape, kType, kLen }; +}; + +TEST_P(VariantZerosLikeTest, OutputsEmptyListWithSameAttrs) { + const auto& param = GetParam(); + const std::vector& shape = std::get(param); + const TfLiteType t = std::get(param); + const int len = std::get(param); + VariantZerosLikeModel m; + m.PopulateListTensor(m.list_input_, shape, len, t); + + ASSERT_EQ(m.Invoke(), kTfLiteOk); + const TensorArray* const out = m.GetOutputTensorArray(); + ASSERT_EQ(out->NumElements(), len); + ASSERT_EQ(out->ElementType(), t); + ASSERT_THAT(out->ElementShape(), DimsAre(shape)); + for (int i = 0; i < len; ++i) { + EXPECT_EQ(out->At(i), nullptr); + } +} + +using VariantZerosLikeItemTestParam = std::tuple>; +class VariantZerosLikeItemTest + : public testing::TestWithParam { + public: + enum { kLen, kShape }; +}; + +TEST_P(VariantZerosLikeItemTest, OutputsEmptyListContainsZeroedElement) { + const auto& param = GetParam(); + const int len = std::get(param); + const std::vector& item_shape = std::get(param); + VariantZerosLikeModel m; + m.PopulateListTensor(m.list_input_, {}, len, kTfLiteInt32); + const int num_elements = NumElements(item_shape.data(), item_shape.size()); + m.ListSetItem(m.list_input_, 0, item_shape, kTfLiteInt32, + std::vector(num_elements, 1).data()); + ASSERT_EQ(m.Invoke(), kTfLiteOk); + + const TensorArray* const out = m.GetOutputTensorArray(); + ASSERT_EQ(out->NumElements(), len); + ASSERT_EQ(out->ElementType(), kTfLiteInt32); + ASSERT_THAT(out->ElementShape(), DimsAre({})); + const TfLiteTensor* const zero = out->At(0); + ASSERT_NE(zero, nullptr); + EXPECT_THAT(zero, AllOf(DimsAre(item_shape), IsAllocatedAs(kTfLiteInt32), + FilledWith(0))); + for (int i = 1; i < len; ++i) { + EXPECT_EQ(out->At(i), nullptr); + } +} + +INSTANTIATE_TEST_SUITE_P(VariantZerosLikeTests, VariantZerosLikeTest, + Combine(ValuesIn(std::vector>{ + {}, {-1}, {2, 2}, {3, 3, 3}}), + ValuesIn({kTfLiteInt32, kTfLiteInt64, + kTfLiteFloat32, kTfLiteBool}), + 
ValuesIn({0, 2, 10}))); + +INSTANTIATE_TEST_SUITE_P(VariantZerosLikeTests, VariantZerosLikeItemTest, + Combine(ValuesIn({1, 2, 10}), + ValuesIn(std::vector>{ + {1}, {2, 2}, {3, 3, 3}}))); + +} // namespace +} // namespace ops +} // namespace variants +} // namespace tflite diff --git a/tensorflow/lite/kernels/variants/list_ops_lib.h b/tensorflow/lite/kernels/variants/list_ops_lib.h index 33f1b16a8e59c8..52efd3abae82e8 100644 --- a/tensorflow/lite/kernels/variants/list_ops_lib.h +++ b/tensorflow/lite/kernels/variants/list_ops_lib.h @@ -49,6 +49,8 @@ TfLiteRegistration* Register_LIST_PUSH_BACK(); TfLiteRegistration* Register_VARIANT_ADD_N(); +TfLiteRegistration* Register_VARIANT_ZEROS_LIKE(); + } // namespace ops } // namespace variants } // namespace tflite diff --git a/tensorflow/lite/kernels/variants/py/BUILD b/tensorflow/lite/kernels/variants/py/BUILD index 4373d1e389f5fe..da3e0dfb81b650 100644 --- a/tensorflow/lite/kernels/variants/py/BUILD +++ b/tensorflow/lite/kernels/variants/py/BUILD @@ -22,11 +22,12 @@ py_strict_test( tags = ["nochromiumos_arm"], deps = [ ":register_list_ops_py", + "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/lite/python:interpreter", "//tensorflow/python/ops:list_ops", "//tensorflow/python/platform:test", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/lite/kernels/variants/register_list_ops.cc b/tensorflow/lite/kernels/variants/register_list_ops.cc index 9158c6f8b9f74d..36247e35960477 100644 --- a/tensorflow/lite/kernels/variants/register_list_ops.cc +++ b/tensorflow/lite/kernels/variants/register_list_ops.cc @@ -32,6 +32,7 @@ void RegisterListOps(MutableOpResolver* resolver) { resolver->AddCustom("TensorListPopBack", Register_LIST_POP_BACK()); resolver->AddCustom("TensorListPushBack", Register_LIST_PUSH_BACK()); resolver->AddCustom("VariantAddN", Register_VARIANT_ADD_N()); + resolver->AddCustom("VariantZerosLike", Register_VARIANT_ZEROS_LIKE()); } } // namespace ops diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index adb0409b2fb791..1547947dc80fe1 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -59,13 +59,14 @@ py_strict_test( ], deps = [ ":interpreter", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/lite/python/metrics", "//tensorflow/lite/python/testdata:_pywrap_test_registerer", "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:resource_loader", - "//third_party/py/numpy", ], ) @@ -122,6 +123,7 @@ py_strict_test( python_version = "PY3", deps = [ ":test_util", + #internal proto upb dep "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:gfile", @@ -149,14 +151,16 @@ py_strict_test( ":convert", ":test_util", ":tflite_convert_main_lib", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:tf2", "//tensorflow/python/client:session", "//tensorflow/python/eager:def_function", - "//tensorflow/python/framework", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:importer", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", "//tensorflow/python/ops:array_ops", @@ -168,7 +172,6 @@ py_strict_test( 
"//tensorflow/python/saved_model:save", "//tensorflow/python/trackable:autotrackable", "//tensorflow/python/training:training_util", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) @@ -197,11 +200,11 @@ py_strict_library( "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:function", - "//tensorflow/python/framework", "//tensorflow/python/framework:byte_swap_tensor", "//tensorflow/python/framework:convert_to_constants", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:importer", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:versions", "//tensorflow/python/platform:gfile", @@ -240,6 +243,8 @@ py_strict_test( ":lite_constants", ":schema_py", ":util", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/python/client:session", "//tensorflow/python/eager:context", @@ -262,7 +267,6 @@ py_strict_test( "//tensorflow/python/platform:resource_loader", "//tensorflow/python/saved_model", "//tensorflow/python/training:training_util", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) @@ -289,6 +293,8 @@ py_strict_test( ":schema_py", ":test_util", ":util", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_options_proto_py", "//tensorflow/lite/python/testdata:_pywrap_test_registerer", @@ -308,7 +314,6 @@ py_strict_test( "//tensorflow/python/saved_model:save", "//tensorflow/python/saved_model:save_options", "//tensorflow/python/trackable:autotrackable", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", "@pypi_jax//:pkg", ], @@ -351,14 +356,16 @@ py_strict_test( ":interpreter", ":lite", ":test_util", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/core:protos_all_py", "//tensorflow/lite/python/testdata:double_op", "//tensorflow/python/client:session", "//tensorflow/python/eager:def_function", - "//tensorflow/python/framework", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:importer", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", "//tensorflow/python/ops:array_ops", @@ -368,7 +375,6 @@ py_strict_test( "//tensorflow/python/platform:client_testlib", "//tensorflow/python/saved_model", "//tensorflow/python/trackable:autotrackable", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) @@ -409,6 +415,8 @@ py_strict_test( ], deps = [ ":util", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/lite/tools:flatbuffer_utils", "//tensorflow/python/client:session", @@ -420,7 +428,6 @@ py_strict_test( "//tensorflow/python/ops:math_ops", "//tensorflow/python/ops:while_loop", "//tensorflow/python/platform:client_testlib", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) @@ -561,6 +568,7 @@ py_strict_test( visibility = ["//visibility:public"], deps = [ ":convert_saved_model", + #internal proto upb dep "//tensorflow/python/client:session", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", @@ -644,6 +652,7 @@ py_strict_test( python_version = "PY3", deps = [ ":analyzer", + #internal proto upb dep "//tensorflow:tensorflow_py", "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", diff 
--git a/tensorflow/lite/python/authoring/BUILD b/tensorflow/lite/python/authoring/BUILD index b6f13fc4f9a8be..d08bb377620a34 100644 --- a/tensorflow/lite/python/authoring/BUILD +++ b/tensorflow/lite/python/authoring/BUILD @@ -27,6 +27,7 @@ py_strict_test( srcs_version = "PY2AND3", deps = [ ":authoring", + #internal proto upb dep "//tensorflow:tensorflow_py", ], ) diff --git a/tensorflow/lite/python/interpreter_test.py b/tensorflow/lite/python/interpreter_test.py index 52eb953a4a6638..d3aea0399683af 100644 --- a/tensorflow/lite/python/interpreter_test.py +++ b/tensorflow/lite/python/interpreter_test.py @@ -308,10 +308,18 @@ def testCreationCounter(self, increase_call): class InterpreterTestErrorPropagation(test_util.TensorFlowTestCase): + # Model must have at least 7 bytes to hold model identifier + def testTooShortModelContent(self): + with self.assertRaisesRegex( + ValueError, + 'Model provided must have at least 7 bytes to hold identifier.', + ): + interpreter_wrapper.Interpreter(model_content=b'short') + def testInvalidModelContent(self): with self.assertRaisesRegex(ValueError, 'Model provided has model identifier \''): - interpreter_wrapper.Interpreter(model_content=b'garbage') + interpreter_wrapper.Interpreter(model_content=b'wrong_identifier') def testInvalidModelFile(self): with self.assertRaisesRegex(ValueError, diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 956825ba639b6f..d3dbfcf2ce5b21 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -1160,7 +1160,7 @@ def _validate_inference_input_output_types(self, quant_mode): if quant_mode.is_post_training_int16x8_quantization(): all_types = default_types + [_dtypes.int16] else: - all_types = default_types + [_dtypes.int8, _dtypes.uint8] + all_types = default_types + [_dtypes.int8, _dtypes.uint8, _dtypes.int16] if ( self.inference_input_type not in all_types or self.inference_output_type not in all_types diff --git a/tensorflow/lite/python/metrics/BUILD b/tensorflow/lite/python/metrics/BUILD index 1dc0a837124aca..cc86ea7b46dc50 100644 --- a/tensorflow/lite/python/metrics/BUILD +++ b/tensorflow/lite/python/metrics/BUILD @@ -69,6 +69,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":metrics_wrapper", + #internal proto upb dep "//tensorflow:tensorflow_py", "//tensorflow/lite/python:convert", "//tensorflow/lite/python:lite", @@ -131,6 +132,9 @@ py_strict_test( deps = [ ":converter_error_data_proto_py", ":metrics", + "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/core:protos_all_py", "//tensorflow/lite/python:convert", @@ -138,9 +142,9 @@ py_strict_test( "//tensorflow/python/client:session", "//tensorflow/python/eager:context", "//tensorflow/python/eager:monitoring", - "//tensorflow/python/framework", "//tensorflow/python/framework:convert_to_constants", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:importer", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", "//tensorflow/python/ops:array_ops", @@ -152,8 +156,6 @@ py_strict_test( "//tensorflow/python/platform:resource_loader", "//tensorflow/python/saved_model", "//tensorflow/python/trackable:autotrackable", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/lite/python/optimize/BUILD b/tensorflow/lite/python/optimize/BUILD index e8bf37b60f77c2..9df94b4a9a8054 100644 --- a/tensorflow/lite/python/optimize/BUILD +++ 
b/tensorflow/lite/python/optimize/BUILD @@ -88,6 +88,9 @@ py_strict_test( tags = ["no_oss"], deps = [ ":calibrator", + "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py_no_contrib", "//tensorflow/lite/python:lite", "//tensorflow/lite/python:schema_py", @@ -96,7 +99,5 @@ py_strict_test( "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:resource_loader", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/lite/python/testdata/BUILD b/tensorflow/lite/python/testdata/BUILD index 5faaea63bc77af..f05dec25d9cab6 100644 --- a/tensorflow/lite/python/testdata/BUILD +++ b/tensorflow/lite/python/testdata/BUILD @@ -148,8 +148,8 @@ tf_custom_op_py_strict_library( srcs_version = "PY3", deps = [ ":gen_double_op_wrapper", - "//tensorflow/python/framework", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:load_library", "//tensorflow/python/platform:resource_loader", ], ) diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index 6bffeadfbad9cb..382462f938d93b 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -459,7 +459,7 @@ enum BuiltinOperator : int32 { STABLEHLO_CONVERT = 192, // WARNING: No runtime support STABLEHLO_DYNAMIC_SLICE = 193, // WARNING: No runtime support STABLEHLO_DYNAMIC_UPDATE_SLICE = 194, // WARNING: No runtime support - STABLEHLO_PAD = 195, // WARNING: No runtime support + STABLEHLO_PAD = 195, STABLEHLO_IOTA = 196, // WARNING: No runtime support STABLEHLO_DOT_GENERAL = 197, // WARNING: No runtime support STABLEHLO_REDUCE_WINDOW = 198, diff --git a/tensorflow/lite/simple_memory_arena.cc b/tensorflow/lite/simple_memory_arena.cc index 23d1ed50486507..f1299b56ac7502 100644 --- a/tensorflow/lite/simple_memory_arena.cc +++ b/tensorflow/lite/simple_memory_arena.cc @@ -15,23 +15,41 @@ limitations under the License. #include "tensorflow/lite/simple_memory_arena.h" -#include -#include - #include +#include +#include +#include #include -#include #include -#include #include #include #include "tensorflow/lite/core/c/common.h" #include "tensorflow/lite/core/macros.h" + #ifdef TF_LITE_TENSORFLOW_PROFILER #include "tensorflow/lite/tensorflow_profiler_logger.h" #endif // TF_LITE_TENSORFLOW_PROFILER +#if defined(__ANDROID__) +// Android has C11 aligned_alloc only with API 28 or newer, even with C++17 or +// C11 compilation (this is a non-standard behavior). +#define TF_LITE_HAS_ALIGNED_ALLOC (__ANDROID_API__ >= 28) +#elif defined(__APPLE__) +// Apple does not provide aligned_alloc, even with C++17 or C11 compilation +// (this is a non-standard behavior). +#define TF_LITE_HAS_ALIGNED_ALLOC 0 +#elif defined(_WIN32) +// Windows does not provide aligned_alloc, even with C++17 or C11 compilation +// (this is a non-standard behavior). However, it provides _aligned_malloc, +// _aligned_realloc, and _aligned_free, with a slightly different behavior than +// the C11/C++17 standard functions (size requirement, and free function name.) +#define TF_LITE_HAS_ALIGNED_ALLOC 0 +#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L +// C++17 or C11 has (std::)aligned_alloc +#define TF_LITE_HAS_ALIGNED_ALLOC 1 +#endif + namespace { template @@ -40,10 +58,135 @@ T AlignTo(size_t alignment, T offset) { : offset + (alignment - offset % alignment); } +// Allocates memory and aligns it to the specified size. 
Returns a pair of the +// allocation pointer and the aligned pointer. +tflite::PointerAlignedPointerPair AlignedAlloc(size_t size, size_t alignment); + +// Frees up aligned memory. +void AlignedFree(const tflite::PointerAlignedPointerPair& buffer); + +// Reallocates aligned memory +// +// The function either extends the memory allocation in-place, or if that is not +// possible a new allocation is created, the data is copied, and the old buffer +// is deallocated. It is an error to change the alignment during reallocation. +// If the previous allocation is null, this is equivalent to AlignedAlloc. +// Returns pointers to the new allocation. +tflite::PointerAlignedPointerPair AlignedRealloc( + const tflite::PointerAlignedPointerPair& old_buffer, size_t old_size, + size_t new_size, size_t alignment); + +#if defined(_WIN32) +// On Windows provides _aligned_malloc, _aligned_free, and +// _aligned_realloc, use them to implement the Aligned functions. + +tflite::PointerAlignedPointerPair AlignedAlloc(size_t size, size_t alignment) { + char* pointer = reinterpret_cast(_aligned_malloc(size, alignment)); + char* aligned_ptr = pointer; + return {pointer, aligned_ptr}; +} + +void AlignedFree(const tflite::PointerAlignedPointerPair& buffer) { + _aligned_free(buffer.pointer); +} + +tflite::PointerAlignedPointerPair AlignedRealloc( + const tflite::PointerAlignedPointerPair& old_buffer, size_t old_size, + size_t new_size, size_t alignment) { + char* pointer = reinterpret_cast( + _aligned_realloc(old_buffer.pointer, new_size, alignment)); + char* aligned_ptr = pointer; + return {pointer, aligned_ptr}; +} +#else +// Default implementation: Use malloc, allocating extra memory, and align the +// pointer in the allocated buffer. + +tflite::PointerAlignedPointerPair AlignedAlloc(size_t size, size_t alignment) { +#if TF_LITE_HAS_ALIGNED_ALLOC + // (std::)aligned_alloc requires size to be multiple of alignment. + // TODO(b/311495100): when bug is fixed, remove `size + alignment - 1` part. + const size_t allocation_size = AlignTo(alignment, size + alignment - 1); + char* pointer = + reinterpret_cast(::aligned_alloc(alignment, allocation_size)); + char* aligned_ptr = pointer; +#else + // TODO(b/311495100): when bug is fixed, change this to + // `size + std::max(size_t{0}, alignment - alignof(std::max_align_t))` + const size_t allocation_size = size + alignment - 1; + char* pointer = reinterpret_cast(std::malloc(allocation_size)); + char* aligned_ptr = reinterpret_cast( + AlignTo(alignment, reinterpret_cast(pointer))); +#endif +#if defined(__clang__) +#if __has_feature(memory_sanitizer) + std::memset(pointer, 0, allocation_size); +#endif +#endif + return {pointer, aligned_ptr}; +} + +void AlignedFree(const tflite::PointerAlignedPointerPair& buffer) { + std::free(buffer.pointer); +} + +tflite::PointerAlignedPointerPair AlignedRealloc( + const tflite::PointerAlignedPointerPair& old_buffer, size_t old_size, + size_t new_size, size_t alignment) { + tflite::PointerAlignedPointerPair new_buffer = + AlignedAlloc(new_size, alignment); + if (new_size > 0 && old_size > 0) { + // Copy data when both old and new buffers are bigger than 0 bytes. + const size_t copy_amount = std::min(new_size, old_size); + std::memcpy(new_buffer.aligned_pointer, old_buffer.aligned_pointer, + copy_amount); + } + AlignedFree(old_buffer); + return new_buffer; +} +#endif } // namespace namespace tflite { +bool ResizableAlignedBuffer::Resize(size_t new_size) { + if (new_size <= data_size_) { + // Skip reallocation when resizing down. 
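+    // The backing block keeps its current (high-water) size, so existing
+    // pointers into the buffer stay valid and no copy is needed.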
+ return false; + } +#ifdef TF_LITE_TENSORFLOW_PROFILER + PauseHeapMonitoring(/*pause=*/true); + OnTfLiteArenaAlloc(subgraph_index_, reinterpret_cast(this), + new_size); + if (data_size_ > 0) { + OnTfLiteArenaDealloc(subgraph_index_, + reinterpret_cast(this), data_size_); + } +#endif + auto new_buffer = AlignedRealloc(buffer_, data_size_, new_size, alignment_); + bool reallocated = (new_buffer.aligned_pointer != buffer_.aligned_pointer); + buffer_ = new_buffer; + data_size_ = new_size; +#ifdef TF_LITE_TENSORFLOW_PROFILER + PauseHeapMonitoring(/*pause=*/false); +#endif + return reallocated; +} + +void ResizableAlignedBuffer::Release() { + if (buffer_.pointer == nullptr) { + return; + } +#ifdef TF_LITE_TENSORFLOW_PROFILER + OnTfLiteArenaDealloc(subgraph_index_, reinterpret_cast(this), + data_size_); +#endif + AlignedFree(buffer_); + buffer_.pointer = nullptr; + buffer_.aligned_pointer = nullptr; + data_size_ = 0; +} + void SimpleMemoryArena::PurgeAfter(int32_t node) { for (int i = 0; i < active_allocs_.size(); ++i) { if (active_allocs_[i].first_node > node) { @@ -91,7 +234,7 @@ TfLiteStatus SimpleMemoryArena::Allocate( TfLiteContext* context, size_t alignment, size_t size, int32_t tensor, int32_t first_node, int32_t last_node, ArenaAllocWithUsageInterval* new_alloc) { - TF_LITE_ENSURE(context, alignment <= arena_alignment_); + TF_LITE_ENSURE(context, alignment <= underlying_buffer_.GetAlignment()); new_alloc->tensor = tensor; new_alloc->first_node = first_node; new_alloc->last_node = last_node; @@ -141,50 +284,13 @@ TfLiteStatus SimpleMemoryArena::Allocate( return kTfLiteOk; } -TfLiteStatus SimpleMemoryArena::Commit(TfLiteContext* context, - bool* arena_reallocated) { - size_t required_size = RequiredBufferSize(); - if (required_size > underlying_buffer_size_) { - *arena_reallocated = true; -#ifdef TF_LITE_TENSORFLOW_PROFILER - PauseHeapMonitoring(/*pause=*/true); - OnTfLiteArenaAlloc(subgraph_index_, reinterpret_cast(this), - required_size); -#endif - char* new_alloc = new char[required_size]; - char* new_underlying_buffer_aligned_ptr = reinterpret_cast( - AlignTo(arena_alignment_, reinterpret_cast(new_alloc))); - - // If the arena had been previously allocated, copy over the old memory. - // Since Alloc pointers are offset based, they will remain valid in the new - // memory block. - if (high_water_mark_ > 0 && underlying_buffer_size_ > 0) { - size_t copy_amount = std::min( - underlying_buffer_.get() + underlying_buffer_size_ - - underlying_buffer_aligned_ptr_, - new_alloc + required_size - new_underlying_buffer_aligned_ptr); - memcpy(new_underlying_buffer_aligned_ptr, underlying_buffer_aligned_ptr_, - copy_amount); - } - -#ifdef TF_LITE_TENSORFLOW_PROFILER - if (underlying_buffer_size_ > 0) { - OnTfLiteArenaDealloc(subgraph_index_, - reinterpret_cast(this), - underlying_buffer_size_); - } -#endif - underlying_buffer_.reset(new_alloc); - underlying_buffer_size_ = required_size; - underlying_buffer_aligned_ptr_ = new_underlying_buffer_aligned_ptr; -#ifdef TF_LITE_TENSORFLOW_PROFILER - PauseHeapMonitoring(/*pause=*/false); -#endif - } else { - *arena_reallocated = false; - } +TfLiteStatus SimpleMemoryArena::Commit(bool* arena_reallocated) { + // Resize the arena to the high water mark (calculated by Allocate), retaining + // old contents and alignment in the process. Since Alloc pointers are offset + // based, they will remain valid in the new memory block. + *arena_reallocated = underlying_buffer_.Resize(high_water_mark_); committed_ = true; - return underlying_buffer_ != nullptr ? 
kTfLiteOk : kTfLiteError; + return kTfLiteOk; } TfLiteStatus SimpleMemoryArena::ResolveAlloc( @@ -193,11 +299,11 @@ TfLiteStatus SimpleMemoryArena::ResolveAlloc( TF_LITE_ENSURE(context, committed_); TF_LITE_ENSURE(context, output_ptr != nullptr); TF_LITE_ENSURE(context, - underlying_buffer_size_ >= (alloc.offset + alloc.size)); + underlying_buffer_.GetSize() >= (alloc.offset + alloc.size)); if (alloc.size == 0) { *output_ptr = nullptr; } else { - *output_ptr = underlying_buffer_aligned_ptr_ + alloc.offset; + *output_ptr = underlying_buffer_.GetPtr() + alloc.offset; } return kTfLiteOk; } @@ -211,13 +317,7 @@ TfLiteStatus SimpleMemoryArena::ClearPlan() { TfLiteStatus SimpleMemoryArena::ReleaseBuffer() { committed_ = false; -#ifdef TF_LITE_TENSORFLOW_PROFILER - OnTfLiteArenaDealloc(subgraph_index_, reinterpret_cast(this), - underlying_buffer_size_); -#endif - underlying_buffer_size_ = 0; - underlying_buffer_aligned_ptr_ = nullptr; - underlying_buffer_.reset(); + underlying_buffer_.Release(); return kTfLiteOk; } @@ -229,7 +329,7 @@ TFLITE_ATTRIBUTE_WEAK void DumpArenaInfo( void SimpleMemoryArena::DumpDebugInfo( const std::string& name, const std::vector& execution_plan) const { - tflite::DumpArenaInfo(name, execution_plan, underlying_buffer_size_, + tflite::DumpArenaInfo(name, execution_plan, underlying_buffer_.GetSize(), active_allocs_); } diff --git a/tensorflow/lite/simple_memory_arena.h b/tensorflow/lite/simple_memory_arena.h index 8f8859a6c0d594..7275b3014f3660 100644 --- a/tensorflow/lite/simple_memory_arena.h +++ b/tensorflow/lite/simple_memory_arena.h @@ -15,10 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_SIMPLE_MEMORY_ARENA_H_ #define TENSORFLOW_LITE_SIMPLE_MEMORY_ARENA_H_ -#include - +#include #include -#include #include #include @@ -55,6 +53,53 @@ struct ArenaAllocWithUsageInterval { } }; +struct PointerAlignedPointerPair { + char* pointer; + char* aligned_pointer; +}; + +class ResizableAlignedBuffer { + public: + ResizableAlignedBuffer(size_t alignment, int subgraph_index) + : buffer_{nullptr, nullptr}, + data_size_(0), + alignment_(alignment), + subgraph_index_(subgraph_index) { + // To silence unused private member warning, only used with + // TF_LITE_TENSORFLOW_PROFILER + (void)subgraph_index_; + } + + ~ResizableAlignedBuffer() { Release(); } + + // Resizes the buffer to make sure new_size bytes fit in the buffer. Keeps + // alignment and any existing the data. Returns true when any external + // pointers into the data array need to be adjusted (the buffer was moved). + bool Resize(size_t new_size); + // Releases any allocated memory. + void Release(); + + // Pointer to the data array. + char* GetPtr() const { return buffer_.aligned_pointer; } + // Size of the data array. Note: the allocated memory block might be larger + // due to excess alignment requirements. + size_t GetSize() const { return data_size_; } + // Alignment of the data array. + size_t GetAlignment() const { return alignment_; } + + private: + ResizableAlignedBuffer(const ResizableAlignedBuffer&) = delete; + ResizableAlignedBuffer& operator=(const ResizableAlignedBuffer&) = delete; + ResizableAlignedBuffer(ResizableAlignedBuffer&&) = delete; + ResizableAlignedBuffer& operator=(ResizableAlignedBuffer&&) = delete; + + PointerAlignedPointerPair buffer_; + size_t data_size_; + size_t alignment_; + + int subgraph_index_; +}; + // This small class is responsible for allocating, deallocating and reusing // dynamic memory from a common underlying buffer. 
The arena can be used in // scenarios when the pattern of memory allocations and deallocations is @@ -63,11 +108,9 @@ struct ArenaAllocWithUsageInterval { class SimpleMemoryArena { public: explicit SimpleMemoryArena(size_t arena_alignment, int subgraph_index = 0) - : subgraph_index_(subgraph_index), - committed_(false), - arena_alignment_(arena_alignment), + : committed_(false), high_water_mark_(0), - underlying_buffer_size_(0), + underlying_buffer_(arena_alignment, subgraph_index), active_allocs_() {} // Delete all allocs. This should be called when allocating the first node of @@ -99,14 +142,7 @@ class SimpleMemoryArena { int32_t tensor, int32_t first_node, int32_t last_node, ArenaAllocWithUsageInterval* new_alloc); - inline size_t RequiredBufferSize() { - // Add in a small amount of padding to reduce the chance of resize events - // for small allocations. - size_t padding = arena_alignment_; - return arena_alignment_ + high_water_mark_ + padding; - } - - TfLiteStatus Commit(TfLiteContext* context, bool* arena_reallocated); + TfLiteStatus Commit(bool* arena_reallocated); TfLiteStatus ResolveAlloc(TfLiteContext* context, const ArenaAllocWithUsageInterval& alloc, @@ -122,10 +158,10 @@ class SimpleMemoryArena { // again until Commit() is called & tensor allocations are resolved. TfLiteStatus ReleaseBuffer(); - size_t GetBufferSize() const { return underlying_buffer_size_; } + size_t GetBufferSize() const { return underlying_buffer_.GetSize(); } std::intptr_t BasePointer() const { - return reinterpret_cast(underlying_buffer_aligned_ptr_); + return reinterpret_cast(underlying_buffer_.GetPtr()); } // Dumps the memory allocation information of this memory arena (which could @@ -145,16 +181,10 @@ class SimpleMemoryArena { void DumpDebugInfo(const std::string& name, const std::vector& execution_plan) const; - protected: - int subgraph_index_; - private: bool committed_; - size_t arena_alignment_; size_t high_water_mark_; - std::unique_ptr underlying_buffer_; - size_t underlying_buffer_size_; - char* underlying_buffer_aligned_ptr_; + ResizableAlignedBuffer underlying_buffer_; std::vector active_allocs_; }; diff --git a/tensorflow/lite/simple_memory_arena_test.cc b/tensorflow/lite/simple_memory_arena_test.cc index fb21e145b62693..af5a4d8ed668ea 100644 --- a/tensorflow/lite/simple_memory_arena_test.cc +++ b/tensorflow/lite/simple_memory_arena_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include "tensorflow/lite/core/c/common.h" -#include "tensorflow/lite/testing/util.h" namespace tflite { namespace { @@ -56,8 +55,8 @@ TEST(SimpleMemoryArenaTest, BasicZeroAlloc) { // The zero-sized alloc should resolve to null. char* resolved_ptr = nullptr; bool reallocated = false; - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); - ASSERT_TRUE(reallocated); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); + EXPECT_FALSE(reallocated); // Don't allocate when zero bytes are needed. 
EXPECT_EQ(resolved_ptr, nullptr); } @@ -88,7 +87,7 @@ TEST(SimpleMemoryArenaTest, TestClearPlan) { arena.Allocate(&context, 32, 2047, 1, 1, 2, &allocs[1]); arena.Allocate(&context, 32, 2047, 2, 1, 2, &allocs[2]); bool reallocated = false; - arena.Commit(&context, &reallocated); + arena.Commit(&reallocated); ASSERT_TRUE(reallocated); EXPECT_EQ(allocs[0].offset, 0); @@ -101,7 +100,7 @@ TEST(SimpleMemoryArenaTest, TestClearPlan) { arena.Allocate(&context, 32, 1023, 3, 0, 2, &allocs[3]); arena.Allocate(&context, 32, 1023, 4, 1, 2, &allocs[4]); arena.Allocate(&context, 32, 1023, 5, 1, 2, &allocs[5]); - arena.Commit(&context, &reallocated); + arena.Commit(&reallocated); ASSERT_FALSE(reallocated); EXPECT_EQ(allocs[3].offset, 0); @@ -114,7 +113,7 @@ TEST(SimpleMemoryArenaTest, TestClearPlan) { arena.Allocate(&context, 32, 4095, 6, 0, 2, &allocs[6]); arena.Allocate(&context, 32, 4095, 7, 1, 2, &allocs[7]); arena.Allocate(&context, 32, 4095, 8, 1, 2, &allocs[8]); - arena.Commit(&context, &reallocated); + arena.Commit(&reallocated); ASSERT_TRUE(reallocated); EXPECT_EQ(allocs[6].offset, 0); @@ -136,7 +135,7 @@ TEST(SimpleMemoryArenaTest, TestPurgeAllocs) { /*first_node=*/2, /*last_node=*/3, &allocs[2]); bool reallocated = false; - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_TRUE(reallocated); char* resolved_ptr0 = nullptr; char* resolved_ptr1 = nullptr; @@ -167,7 +166,7 @@ TEST(SimpleMemoryArenaTest, TestPurgeAllocs) { arena.PurgeActiveAllocs(4); arena.Allocate(&context, /*alignment=*/32, /*size=*/13, /*tensor=*/3, /*first_node=*/4, /*last_node=*/5, &allocs[4]); - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_EQ(arena.ResolveAlloc(&context, allocs[4], &resolved_ptr3), kTfLiteOk); /* no tensors are allocated at node 4, so tensor 3's offset should be zero.*/ ASSERT_EQ(allocs[4].offset, 0); @@ -190,7 +189,7 @@ TEST(SimpleMemoryArenaTest, TestPurgeAllocs) { */ arena.Allocate(&context, /*alignment=*/32, /*size=*/2047, /*tensor=*/0, /*first_node=*/0, /*last_node=*/2, &allocs[0]); - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_EQ(arena.ResolveAlloc(&context, allocs[3], &resolved_ptr3), kTfLiteOk); ASSERT_EQ(allocs[0].offset, 0); } @@ -209,7 +208,7 @@ TEST(SimpleMemoryArenaTest, TestResetAllocs) { /*first_node=*/2, /*last_node=*/3, &allocs[2]); bool reallocated = false; - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_TRUE(reallocated); char* resolved_ptr0 = nullptr; char* resolved_ptr1 = nullptr; @@ -239,7 +238,7 @@ TEST(SimpleMemoryArenaTest, TestResetAllocs) { */ arena.Allocate(&context, /*alignment=*/32, /*size=*/13, /*tensor=*/0, /*first_node=*/0, /*last_node=*/3, &allocs[3]); - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); /* This is the expected arena after tensor3 has been allocated. 
* |xxxxxxxxxxxxxxxxx| tensor3 * |xxxxx| tensor2 @@ -275,7 +274,7 @@ TEST(SimpleMemoryArenaTest, TestResetAllocs) { * ___________________ */ - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_EQ(arena.ResolveAlloc(&context, allocs[3], &resolved_ptr3), kTfLiteOk); ASSERT_EQ(allocs[3].offset, 0); } @@ -294,7 +293,7 @@ TEST(SimpleMemoryArenaTest, TestClearBuffer) { // Commit and ensure resolved pointers are not null. bool reallocated = false; - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_TRUE(reallocated); char* resolved_ptr = nullptr; ASSERT_EQ(arena.ResolveAlloc(&context, allocs[0], &resolved_ptr), kTfLiteOk); @@ -311,7 +310,7 @@ TEST(SimpleMemoryArenaTest, TestClearBuffer) { ASSERT_NE(arena.ResolveAlloc(&context, allocs[0], &resolved_ptr), kTfLiteOk); // Commit again and ensure resolved pointers are not null. - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_TRUE(reallocated); ASSERT_NE(arena.BasePointer(), 0); resolved_ptr = nullptr; @@ -337,7 +336,7 @@ TEST_P(BufferAndPlanClearingTest, TestClearBufferAndClearPlan) { arena.Allocate(&context, 32, 2047, 1, 1, 2, &allocs[1]); bool reallocated = false; - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_TRUE(reallocated); if (GetParam()) { @@ -349,15 +348,17 @@ TEST_P(BufferAndPlanClearingTest, TestClearBufferAndClearPlan) { } // Just committing won't work, allocations need to be made again. - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); - ASSERT_TRUE(reallocated); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); + // There was no allocation, the buffer has 0 bytes (was released) and the high + // water mark is 0 (plan was cleared). + EXPECT_FALSE(reallocated); char* resolved_ptr = nullptr; ASSERT_NE(arena.ResolveAlloc(&context, allocs[0], &resolved_ptr), kTfLiteOk); // Re-allocate tensors & commit. arena.Allocate(&context, 32, 2047, 0, 0, 2, &allocs[0]); arena.Allocate(&context, 32, 2047, 1, 1, 2, &allocs[1]); - ASSERT_EQ(arena.Commit(&context, &reallocated), kTfLiteOk); + ASSERT_EQ(arena.Commit(&reallocated), kTfLiteOk); ASSERT_TRUE(reallocated); // Pointer-resolution now works. 
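(For reference, a minimal usage sketch of the updated arena API exercised by the
tests above, assuming a TfLiteContext named context is in scope; error handling
omitted.)

  SimpleMemoryArena arena(/*arena_alignment=*/64);
  ArenaAllocWithUsageInterval alloc;
  arena.Allocate(&context, /*alignment=*/32, /*size=*/2047, /*tensor=*/0,
                 /*first_node=*/0, /*last_node=*/2, &alloc);
  bool reallocated = false;
  arena.Commit(&reallocated);  // no TfLiteContext* parameter anymore
  char* ptr = nullptr;
  arena.ResolveAlloc(&context, alloc, &ptr);  // ptr = arena base + alloc.offset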
diff --git a/tensorflow/lite/testing/op_tests/gather_nd.py b/tensorflow/lite/testing/op_tests/gather_nd.py index 66d8a30033b7bd..37eb052ebff73f 100644 --- a/tensorflow/lite/testing/op_tests/gather_nd.py +++ b/tensorflow/lite/testing/op_tests/gather_nd.py @@ -25,19 +25,40 @@ def make_gather_nd_tests(options): test_parameters = [ { - "params_dtype": [tf.float32, tf.int16, tf.int32, tf.int64, tf.string], + "params_dtype": [ + tf.float32, + tf.int16, + tf.int32, + tf.int64, + tf.string, + tf.bool, + ], "params_shape": [[5, 1]], "indices_dtype": [tf.int16, tf.int32, tf.int64], "indices_shape": [[1, 1]], }, { - "params_dtype": [tf.float32, tf.int16, tf.int32, tf.int64, tf.string], + "params_dtype": [ + tf.float32, + tf.int16, + tf.int32, + tf.int64, + tf.string, + tf.bool, + ], "params_shape": [[5, 5]], "indices_dtype": [tf.int16, tf.int32, tf.int64], "indices_shape": [[2, 1], [2, 2]], }, { - "params_dtype": [tf.float32, tf.int16, tf.int32, tf.int64, tf.string], + "params_dtype": [ + tf.float32, + tf.int16, + tf.int32, + tf.int64, + tf.string, + tf.bool, + ], "params_shape": [[5, 5, 10]], "indices_dtype": [tf.int16, tf.int32, tf.int64], "indices_shape": [[3, 1], [2, 2], [2, 3], [2, 1, 3]], diff --git a/tensorflow/lite/toco/logging/BUILD b/tensorflow/lite/toco/logging/BUILD index 17ec59f24d9f71..83daab2357364a 100644 --- a/tensorflow/lite/toco/logging/BUILD +++ b/tensorflow/lite/toco/logging/BUILD @@ -94,6 +94,7 @@ py_strict_test( deps = [ ":gen_html", ":toco_conversion_log_proto_py", + #internal proto upb dep "//tensorflow/python/framework:test_lib", "//tensorflow/python/lib/io:file_io", "//tensorflow/python/platform:client_testlib", diff --git a/tensorflow/lite/toco/tflite/BUILD b/tensorflow/lite/toco/tflite/BUILD index 77094a64b7d45b..a4ea52f9ac37ae 100644 --- a/tensorflow/lite/toco/tflite/BUILD +++ b/tensorflow/lite/toco/tflite/BUILD @@ -88,12 +88,14 @@ cc_library( hdrs = [ "export.h", ], - features = ["-layering_check"], visibility = ["//visibility:public"], deps = [ ":operator", ":types", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/lite:context", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite:util", "//tensorflow/lite/schema:schema_conversion_utils", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/toco:model", @@ -110,11 +112,13 @@ tf_cc_test( srcs = [ "export_test.cc", ], - features = ["-layering_check"], deps = [ ":export", ":operator", + ":types", + "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/schema:schema_utils", "@com_google_googletest//:gtest_main", diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD index 12e9d0db25a914..704021241d6e07 100644 --- a/tensorflow/lite/tools/BUILD +++ b/tensorflow/lite/tools/BUILD @@ -51,6 +51,7 @@ py_strict_test( deps = [ ":test_utils", ":visualize_lib", + #internal proto upb dep "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", ], @@ -95,11 +96,12 @@ py_strict_test( srcs_version = "PY3", deps = [ ":convert_image_to_csv_lib", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow/python/framework:errors", "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:resource_loader", - "//third_party/py/numpy", ], ) @@ -159,6 +161,7 @@ py_strict_test( deps = [ ":flatbuffer_utils", ":test_utils", + #internal proto upb dep "//tensorflow/python/framework:test_lib", 
"//tensorflow/python/platform:client_testlib", ], diff --git a/tensorflow/lite/tools/build_aar.sh b/tensorflow/lite/tools/build_aar.sh index 1847b794d757aa..644bd08f6dda8d 100755 --- a/tensorflow/lite/tools/build_aar.sh +++ b/tensorflow/lite/tools/build_aar.sh @@ -90,12 +90,14 @@ function generate_tflite_aar { popd > /dev/null # TODO(b/254278688): Enable 'xnn_enable_arm_fp16' with toolchain upgrade. # TODO(b/297897797): Enable 'xnn_enable_arm_i8mm' with toolchain upgrade. - bazel ${CACHE_DIR_FLAG} build -c opt --cxxopt='--std=c++17' \ + # TODO: b/315114212 - Remove `xnn_enable_vnni` when the compiler supports it. + bazel ${CACHE_DIR_FLAG} build -c opt --config=opt --cxxopt='--std=c++17' \ --fat_apk_cpu=${TARGET_ARCHS} \ --define=android_dexmerger_tool=d8_dexmerger \ --define=android_incremental_dexing_tool=d8_dexbuilder\ --define=xnn_enable_arm_fp16=false \ --define=xnn_enable_arm_i8mm=false \ + --define=xnn_enable_avxvnni=false \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ //tmp:tensorflow-lite @@ -130,12 +132,14 @@ function generate_flex_aar { # Build the aar package. # TODO(b/254278688): Enable 'xnn_enable_arm_fp16' with toolchain upgrade. # TODO(b/297897797): Enable 'xnn_enable_arm_i8mm' with toolchain upgrade. - bazel ${CACHE_DIR_FLAG} build -c opt --cxxopt='--std=c++17' \ + # TODO: b/315114212 - Remove `xnn_enable_vnni` when the compiler supports it. + bazel ${CACHE_DIR_FLAG} build -c opt --config=opt --cxxopt='--std=c++17' \ --fat_apk_cpu=${TARGET_ARCHS} \ --define=android_dexmerger_tool=d8_dexmerger \ --define=android_incremental_dexing_tool=d8_dexbuilder\ --define=xnn_enable_arm_fp16=false \ --define=xnn_enable_arm_i8mm=false \ + --define=xnn_enable_avxvnni=false \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ //tmp:tensorflow-lite-select-tf-ops @@ -191,14 +195,16 @@ fi # Build the standard aar package of no models provided. # TODO(b/254278688): Enable 'xnn_enable_arm_fp16' with toolchain upgrade. # TODO(b/297897797): Enable 'xnn_enable_arm_i8mm' with toolchain upgrade. +# TODO: b/315114212 - Remove `xnn_enable_vnni` when the compiler supports it. if [ -z ${FLAG_MODELS} ]; then - bazel ${CACHE_DIR_FLAG} build -c opt --cxxopt='--std=c++17' \ + bazel ${CACHE_DIR_FLAG} build -c opt --config=opt --cxxopt='--std=c++17' \ --config=monolithic \ --fat_apk_cpu=${TARGET_ARCHS} \ --define=android_dexmerger_tool=d8_dexmerger \ --define=android_incremental_dexing_tool=d8_dexbuilder\ --define=xnn_enable_arm_fp16=false \ --define=xnn_enable_arm_i8mm=false \ + --define=xnn_enable_avxvnni=false \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ //tensorflow/lite/java:tensorflow-lite diff --git a/tensorflow/lite/tools/build_aar_with_docker.sh b/tensorflow/lite/tools/build_aar_with_docker.sh index 27624f943d4f1a..fbcc9325cd5fa0 100755 --- a/tensorflow/lite/tools/build_aar_with_docker.sh +++ b/tensorflow/lite/tools/build_aar_with_docker.sh @@ -104,14 +104,16 @@ else cd /tensorflow_src # Run configure. + # -Wno-c++20-designator can be removed once tf supports C++20. + # -Wno-gnu-inline-cpp-without-extern is needed for NEON2SSE. Can remove after + # https://github.com/intel/ARM_NEON_2_x86_SSE/issues/57 is resolved. 
configs=( '/usr/bin/python3' '/usr/lib/python3/dist-packages' 'N' 'N' 'N' - 'N' - '-march=native -Wno-sign-compare' + '-Wno-sign-compare -Wno-c++20-designator -Wno-gnu-inline-cpp-without-extern' 'y' '/android/sdk' ) diff --git a/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake b/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake index 7866627555d030..d72fa2c18c07ca 100644 --- a/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake +++ b/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake @@ -22,8 +22,8 @@ include(OverridableFetchContent) OverridableFetchContent_Declare( cpuinfo GIT_REPOSITORY https://github.com/pytorch/cpuinfo - # Sync with tensorflow/third_party/cpuinfo/workspace.bzl - GIT_TAG 959002f82d7962a473d8bf301845f2af720e0aa4 + # Sync with tensorflow/workspace2.bzl + GIT_TAG ef634603954d88d2643d5809011288b890ac126e GIT_PROGRESS TRUE SOURCE_DIR "${CMAKE_BINARY_DIR}/cpuinfo" ) diff --git a/tensorflow/lite/tools/cmake/modules/gemmlowp.cmake b/tensorflow/lite/tools/cmake/modules/gemmlowp.cmake index ac296c0307f901..76d9705475b05b 100644 --- a/tensorflow/lite/tools/cmake/modules/gemmlowp.cmake +++ b/tensorflow/lite/tools/cmake/modules/gemmlowp.cmake @@ -23,7 +23,7 @@ OverridableFetchContent_Declare( gemmlowp GIT_REPOSITORY https://github.com/google/gemmlowp # Sync with tensorflow/third_party/gemmlowp/workspace.bzl - GIT_TAG e844ffd17118c1e17d94e1ba4354c075a4577b88 + GIT_TAG 16e8662c34917be0065110bfcd9cc27d30f52fdf # It's not currently (cmake 3.17) possible to shallow clone with a GIT TAG # as cmake attempts to git checkout the commit hash after the clone # which doesn't work as it's a shallow clone hence a different commit hash. diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake index a6b36451cb819b..436be3901c4865 100644 --- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake +++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake @@ -23,7 +23,7 @@ OverridableFetchContent_Declare( xnnpack GIT_REPOSITORY https://github.com/google/XNNPACK # Sync with tensorflow/workspace2.bzl - GIT_TAG c7e7cde37615a81a529c326aa278bfab4cd6fe5a + GIT_TAG 0cbbe74a16e6ca11acf8484ccac85f620336dea4 GIT_PROGRESS TRUE PREFIX "${CMAKE_BINARY_DIR}" SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack" diff --git a/tensorflow/lite/tools/optimize/debugging/python/BUILD b/tensorflow/lite/tools/optimize/debugging/python/BUILD index e77895b569141f..529fdec107f5bd 100644 --- a/tensorflow/lite/tools/optimize/debugging/python/BUILD +++ b/tensorflow/lite/tools/optimize/debugging/python/BUILD @@ -29,6 +29,9 @@ py_strict_test( python_version = "PY3", deps = [ ":debugger", + "@absl_py//absl/testing:parameterized", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/lite/python:convert", "//tensorflow/lite/python:lite", @@ -36,7 +39,5 @@ py_strict_test( "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/trackable:autotrackable", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/lite/tools/optimize/python/BUILD b/tensorflow/lite/tools/optimize/python/BUILD index 2cba7d719c4d11..9c3527eb56b684 100644 --- a/tensorflow/lite/tools/optimize/python/BUILD +++ b/tensorflow/lite/tools/optimize/python/BUILD @@ -40,10 +40,11 @@ py_strict_test( srcs_version = "PY3", deps = [ ":modify_model_interface_lib", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/python/framework:test_lib", 
"//tensorflow/python/platform:client_testlib", - "//third_party/py/numpy", ], ) diff --git a/tensorflow/lite/tools/optimize/sparsity/BUILD b/tensorflow/lite/tools/optimize/sparsity/BUILD index 13a95f0c517205..6a1a447e19e297 100644 --- a/tensorflow/lite/tools/optimize/sparsity/BUILD +++ b/tensorflow/lite/tools/optimize/sparsity/BUILD @@ -35,7 +35,8 @@ py_strict_test( python_version = "PY3", deps = [ ":format_converter_wrapper_pybind11", - "//third_party/py/numpy", "@absl_py//absl/testing:absltest", + #internal proto upb dep + "//third_party/py/numpy", ], ) diff --git a/tensorflow/lite/tools/signature/BUILD b/tensorflow/lite/tools/signature/BUILD index d418b826ce57ad..161dcd1554d04b 100644 --- a/tensorflow/lite/tools/signature/BUILD +++ b/tensorflow/lite/tools/signature/BUILD @@ -104,6 +104,7 @@ py_strict_test( visibility = ["//visibility:public"], deps = [ ":signature_def_utils", + #internal proto upb dep "//tensorflow:tensorflow_py", "//tensorflow/core:protos_all_py", ], diff --git a/tensorflow/lite/tools/tflite-android.Dockerfile b/tensorflow/lite/tools/tflite-android.Dockerfile index d1981b0224d2d4..3d84412ccb49dd 100644 --- a/tensorflow/lite/tools/tflite-android.Dockerfile +++ b/tensorflow/lite/tools/tflite-android.Dockerfile @@ -9,8 +9,8 @@ RUN apt-get update && \ # Install Android SDK. ENV ANDROID_SDK_FILENAME commandlinetools-linux-6858069_latest.zip ENV ANDROID_SDK_URL https://dl.google.com/android/repository/${ANDROID_SDK_FILENAME} -ENV ANDROID_API_LEVEL 23 -ENV ANDROID_NDK_API_LEVEL 21 +ENV ANDROID_API_LEVEL 30 +ENV ANDROID_NDK_API_LEVEL 30 # Build Tools Version liable to change. ENV ANDROID_BUILD_TOOLS_VERSION 31.0.0 ENV ANDROID_SDK_HOME ${ANDROID_DEV_HOME}/sdk @@ -23,7 +23,7 @@ RUN cd ${ANDROID_DEV_HOME} && \ rm ${ANDROID_SDK_FILENAME} # Install Android NDK. -ENV ANDROID_NDK_FILENAME android-ndk-r21e-linux-x86_64.zip +ENV ANDROID_NDK_FILENAME android-ndk-r25b-linux.zip ENV ANDROID_NDK_URL https://dl.google.com/android/repository/${ANDROID_NDK_FILENAME} ENV ANDROID_NDK_HOME ${ANDROID_DEV_HOME}/ndk ENV PATH ${PATH}:${ANDROID_NDK_HOME} diff --git a/tensorflow/lite/tools/verifier.h b/tensorflow/lite/tools/verifier.h index 93bc5433c80e4e..f90d77b558fea0 100644 --- a/tensorflow/lite/tools/verifier.h +++ b/tensorflow/lite/tools/verifier.h @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ - #ifndef TENSORFLOW_LITE_TOOLS_VERIFIER_H_ #define TENSORFLOW_LITE_TOOLS_VERIFIER_H_ -#include "tensorflow/lite/core/tools/verifier.h" +/// For documentation, see third_party/tensorflow/lite/core/tools/verifier.h + +#include "tensorflow/lite/core/tools/verifier.h" // IWYU pragma: export namespace tflite { diff --git a/tensorflow/lite/tools/verifier_internal.h b/tensorflow/lite/tools/verifier_internal.h index a3f499bc1fd10f..88380466877e50 100644 --- a/tensorflow/lite/tools/verifier_internal.h +++ b/tensorflow/lite/tools/verifier_internal.h @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ - #ifndef TENSORFLOW_LITE_TOOLS_VERIFIER_INTERNAL_H_ #define TENSORFLOW_LITE_TOOLS_VERIFIER_INTERNAL_H_ -#include "tensorflow/lite/core/tools/verifier_internal.h" +/// For documentation, see third_party/tensorflow/lite/core/tools/verifier_internal.h + +#include "tensorflow/lite/core/tools/verifier_internal.h" // IWYU pragma: export namespace tflite { namespace internal { diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index e05b6419f863db..bba285328d2527 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -721,6 +721,9 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } case BuiltinOperator_GATHER_ND: + if (op_sig.inputs.at(0).type == kTfLiteBool) { + return 5; + } if (op_sig.inputs.at(1).type == kTfLiteInt16) { return 4; } diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index 3d2e055894f978..5cff633f0ee0d0 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -1047,6 +1047,13 @@ TEST(OpVersionTest, VersioningGatherNdOperatorTest) { std::vector{kTfLiteInt32, kTfLiteInt16}), }; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 4); + + fake_op_sig = { + .op = BuiltinOperator_GATHER_ND, + .inputs = CreateOpSignatureTensorSpecs( + std::vector{kTfLiteBool, kTfLiteInt16}), + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 5); } TEST(OpVersionTest, VersioningDivTest) { OpSignature fake_op_sig = { diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc index 47282cbf371e9a..d011a5d5438e46 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.cc +++ b/tensorflow/lite/tools/versioning/runtime_version.cc @@ -143,6 +143,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_GATHER_ND, 2}, "2.3.0"}, {{BuiltinOperator_GATHER_ND, 3}, "2.5.0"}, {{BuiltinOperator_GATHER_ND, 4}, "2.13.0"}, + {{BuiltinOperator_GATHER_ND, 5}, "2.16.0"}, {{BuiltinOperator_HASHTABLE_LOOKUP, 1}, "1.5.0"}, {{BuiltinOperator_SVDF, 1}, "1.5.0"}, {{BuiltinOperator_SVDF, 2}, "1.14.0"}, @@ -439,7 +440,8 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_STABLEHLO_MULTIPLY, 1}, "2.16.0"}, {{BuiltinOperator_STABLEHLO_REDUCE_WINDOW, 1}, "2.16.0"}, {{BuiltinOperator_STABLEHLO_MAXIMUM, 1}, "2.16.0"}, - {{BuiltinOperator_STABLEHLO_MINIMUM, 1}, "2.16.0"}}); + {{BuiltinOperator_STABLEHLO_MINIMUM, 1}, "2.16.0"}, + {{BuiltinOperator_STABLEHLO_PAD, 1}, "2.16.0"}}); std::pair version_key = {op_code, op_version}; auto it = op_version_map->find(version_key); diff --git a/tensorflow/lite/tutorials/BUILD b/tensorflow/lite/tutorials/BUILD index 9c34628d29418b..77c275d74651b6 100644 --- a/tensorflow/lite/tutorials/BUILD +++ b/tensorflow/lite/tutorials/BUILD @@ -1,6 +1,6 @@ # Example Estimator model -load("//tensorflow:strict.default.bzl", "py_strict_binary") +load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -10,11 +10,18 @@ package( py_strict_binary( name = "mnist_tflite", - srcs = [ - "dataset.py", - "mnist_tflite.py", - ], + srcs = ["mnist_tflite.py"], python_version = "PY3", + deps = [ + ":dataset", + 
"//tensorflow:tensorflow_py", + "//third_party/py/numpy", + ], +) + +py_strict_library( + name = "dataset", + srcs = ["dataset.py"], deps = [ "//tensorflow:tensorflow_py", "//third_party/py/numpy", diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 68f19f8d488a63..0a3015106ef946 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -175,6 +175,8 @@ tf_staging/tensorflow/tools/toolchains/BUILD: tf_staging/tensorflow/tools/toolchains/clang6/BUILD: tf_staging/tensorflow/tools/toolchains/cpus/py/BUILD: tf_staging/tensorflow/tools/toolchains/cpus/py3/BUILD: +tf_staging/tensorflow/tools/toolchains/cross_compile/cc/BUILD: +tf_staging/tensorflow/tools/toolchains/cross_compile/config/BUILD: tf_staging/tensorflow/tools/toolchains/embedded/arm-linux/BUILD: tf_staging/tensorflow/tools/toolchains/java/BUILD: tf_staging/tensorflow/tools/toolchains/python/BUILD: @@ -236,7 +238,9 @@ tf_staging/third_party/gpus/crosstool/BUILD: tf_staging/third_party/gpus/crosstool/LICENSE: tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl: tf_staging/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl: +tf_staging/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl: tf_staging/third_party/gpus/cuda/BUILD.tpl: +tf_staging/third_party/gpus/cuda/BUILD.windows.tpl: tf_staging/third_party/gpus/cuda/BUILD: tf_staging/third_party/gpus/cuda/LICENSE: tf_staging/third_party/gpus/cuda/build_defs.bzl.tpl: diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a14198ec061347..d4a4799c0af207 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -83,7 +83,6 @@ py_strict_library( "//tensorflow/python/ops:gradient_checker_v2", "//tensorflow/python/ops:stateful_random_ops", "//tensorflow/python/ops/structured:structured_ops", - "//tensorflow/python/tpu:tpu_estimator", "//tensorflow/python/tpu:tpu_noestimator", ], ) @@ -171,6 +170,7 @@ py_strict_library( "//tensorflow/python/framework:_test_metrics_util", "//tensorflow/python/framework:combinations", "//tensorflow/python/framework:config", + "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:errors", "//tensorflow/python/framework:extension_type", "//tensorflow/python/framework:flexible_dtypes", @@ -350,6 +350,8 @@ py_strict_library( ":tf2", "//tensorflow/core:protos_all_py", "//tensorflow/core/function/trace_type", + "//tensorflow/python/checkpoint/sharding:sharding_policies", + "//tensorflow/python/checkpoint/sharding:sharding_util", "//tensorflow/python/client", "//tensorflow/python/client:device_lib", "//tensorflow/python/client:timeline", @@ -392,6 +394,7 @@ py_strict_library( "//tensorflow/python/lib/io:python_io", "//tensorflow/python/lib/io:tf_record", "//tensorflow/python/module", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/ops:audio_ops_gen", "//tensorflow/python/ops:bincount_ops", "//tensorflow/python/ops:bitwise_ops", @@ -448,6 +451,7 @@ py_strict_library( "//tensorflow/python/profiler:trace", "//tensorflow/python/saved_model", "//tensorflow/python/summary:summary_py", + "//tensorflow/python/summary:tb_summary", "//tensorflow/python/tpu:tpu_noestimator", "//tensorflow/python/training", "//tensorflow/python/training:quantize_training", @@ -463,15 +467,6 @@ py_strict_library( ], ) -# Necessary for the pywrap inclusion below. 
-tf_pybind_cc_library_wrapper( - name = "tfcompile_headers_lib", - compatible_with = [], - deps = [ - "//tensorflow/compiler/aot:tfcompile_lib", - ], -) - tf_python_pybind_extension( name = "_pywrap_tfcompile", srcs = ["tfcompile_wrapper.cc"], @@ -481,15 +476,13 @@ tf_python_pybind_extension( "//tensorflow:windows": [], }), enable_stub_generation = True, - features = ["-layering_check"], pytype_srcs = [ "_pywrap_tfcompile.pyi", ], static_deps = tf_python_pybind_static_deps(), deps = [ - ":tfcompile_headers_lib", "@pybind11", - "//third_party/python_runtime:headers", + "//tensorflow/compiler/aot:tfcompile_lib", "//tensorflow/python/lib/core:pybind11_lib", "//tensorflow/python/lib/core:pybind11_status", # The headers here cannot be brought in via cc_header_only_library @@ -776,7 +769,6 @@ pywrap_tensorflow_macro( "//tensorflow/cc/saved_model:fingerprinting_impl", "//tensorflow/cc/saved_model:loader_lite_impl", "//tensorflow/cc/saved_model:metrics_impl", - "//tensorflow/compiler/mlir/python:mlir", "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model_cc_impl", "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl", @@ -848,7 +840,12 @@ pywrap_tensorflow_macro( "@local_tsl//tsl/profiler/rpc/client:profiler_client_impl", "@local_tsl//tsl/python/lib/core:numpy", "@local_xla//xla/stream_executor:stream_executor_impl", - ] + if_static([ + ] + select({ + "//tensorflow/compiler/mlir/python:disable_mlir_config": [], + "//conditions:default": [ + "//tensorflow/compiler/mlir/python:mlir", + ], + }) + if_static([ "//tensorflow/core/platform:tensor_float_32_utils", "//tensorflow/core/platform:enable_tf2_utils", ]) + if_google([ @@ -886,7 +883,6 @@ filegroup( "//tensorflow/cc/saved_model:metrics_impl", # SavedModel metrics "//tensorflow/compiler/jit:flags", # tfe "//tensorflow/compiler/jit:get_compiler_ir", # tfe - "//tensorflow/compiler/mlir/python:mlir", # mlir "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model_cc_impl", # quantization "//tensorflow/compiler/tf2xla:tf2xla_opset", # pywrap_xla_ops "//tensorflow/core:framework_internal_impl", # op_def_registry @@ -961,7 +957,12 @@ filegroup( "@local_tsl//tsl/python/lib/core:ml_dtypes_lib", # bfloat16, float8_e4m3fn, float8_e5m2 "@local_tsl//tsl/python/lib/core:numpy", # checkpoint_reader "@local_xla//xla/stream_executor", # stat_summarizer - ] + if_xla_available([ + ] + select({ + "//tensorflow/compiler/mlir/python:disable_mlir_config": [], + "//conditions:default": [ + "//tensorflow/compiler/mlir/python:mlir", # mlir + ], + }) + if_xla_available([ "//tensorflow/compiler/aot:tfcompile_lib", # tfcompile "@local_xla//xla:status_macros", # tfcompile "@local_xla//xla/hlo/ir:hlo", # tfcompile diff --git a/tensorflow/python/_pywrap_tfe.pyi b/tensorflow/python/_pywrap_tfe.pyi index 26d129cd2a8566..1385ae69244d58 100644 --- a/tensorflow/python/_pywrap_tfe.pyi +++ b/tensorflow/python/_pywrap_tfe.pyi @@ -179,6 +179,7 @@ def TFE_ClearScalarCache() -> object: ... def TFE_CollectiveOpsCheckPeerHealth(arg0: object, arg1: str, arg2: int) -> None: ... def TFE_ContextAddFunction(arg0: object, arg1: TF_Function) -> None: ... def TFE_ContextAddFunctionDef(arg0: object, arg1: str, arg2: int) -> None: ... +def TFE_ContextAddFunctionDefNoSerialization(ctx: object, function_def) -> None: ... def TFE_ContextCheckAlive(arg0: object, arg1: str) -> bool: ... def TFE_ContextClearCaches(arg0: object) -> None: ... 
def TFE_ContextClearExecutors(arg0: object) -> None: ... diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD index 5624f7611f3c84..82177fb9002207 100644 --- a/tensorflow/python/autograph/converters/BUILD +++ b/tensorflow/python/autograph/converters/BUILD @@ -200,6 +200,7 @@ py_strict_test( ":asserts", ":functions", ":return_statements", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:errors", @@ -214,6 +215,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":break_statements", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/platform:client_testlib", @@ -228,6 +230,7 @@ py_strict_test( deps = [ ":call_trees", ":functions", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/platform:client_testlib", ], @@ -240,6 +243,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":conditional_expressions", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/platform:client_testlib", ], @@ -252,6 +256,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":continue_statements", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/framework:ops", "//tensorflow/python/platform:client_testlib", @@ -267,6 +272,8 @@ py_strict_test( ":break_statements", ":continue_statements", ":control_flow", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:constant_op", @@ -276,7 +283,6 @@ py_strict_test( "//tensorflow/python/framework:tensor_util", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/util:nest", - "//third_party/py/numpy", ], ) @@ -287,6 +293,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":directives", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/autograph/lang:directives", "//tensorflow/python/autograph/pyct:anno", @@ -301,6 +308,7 @@ py_strict_test( deps = [ ":functions", ":return_statements", + #internal proto upb dep "//tensorflow/python/autograph/core:ag_ctx", "//tensorflow/python/autograph/core:converter", "//tensorflow/python/autograph/core:test_lib", @@ -318,6 +326,7 @@ py_strict_test( deps = [ ":directives", ":lists", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/autograph/lang:directives", "//tensorflow/python/autograph/lang:special_functions", @@ -336,6 +345,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":logical_expressions", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:test_lib", @@ -351,6 +361,7 @@ py_strict_test( deps = [ ":functions", ":return_statements", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/framework:ops", "//tensorflow/python/platform:client_testlib", @@ -365,6 +376,7 @@ py_strict_test( deps = [ ":directives", ":slices", + #internal proto upb dep "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/autograph/lang:directives", "//tensorflow/python/framework:constant_op", @@ -381,6 +393,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":variables", + #internal proto upb dep 
"//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/platform:client_testlib", ], diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD index 46983ab39f0a2b..d1d4ee16fe1761 100644 --- a/tensorflow/python/autograph/core/BUILD +++ b/tensorflow/python/autograph/core/BUILD @@ -91,6 +91,7 @@ py_strict_test( deps = [ ":converter", ":test_lib", + #internal proto upb dep "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/autograph/pyct:loader", "//tensorflow/python/autograph/pyct:parser", @@ -107,6 +108,7 @@ py_strict_test( deps = [ ":converter", ":function_wrappers", + #internal proto upb dep "//tensorflow/python/eager:context", "//tensorflow/python/framework:constant_op", "//tensorflow/python/ops:variables", diff --git a/tensorflow/python/autograph/lang/BUILD b/tensorflow/python/autograph/lang/BUILD index d9207ac75a1b87..f857454188571f 100644 --- a/tensorflow/python/autograph/lang/BUILD +++ b/tensorflow/python/autograph/lang/BUILD @@ -31,12 +31,13 @@ py_strict_test( python_version = "PY3", srcs_version = "PY3", deps = [ + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow/python/autograph/lang:special_functions", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:tensor_util", "//tensorflow/python/ops:list_ops", "//tensorflow/python/platform:client_testlib", - "//third_party/py/numpy", ], ) diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD index d3ab48bbf2c245..25dd28737fce2e 100644 --- a/tensorflow/python/autograph/operators/BUILD +++ b/tensorflow/python/autograph/operators/BUILD @@ -159,6 +159,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":data_structures", + #internal proto upb dep "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:tensor", @@ -176,6 +177,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":conditional_expressions", + #internal proto upb dep "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:test_lib", @@ -194,6 +196,8 @@ py_strict_test( deps = [ ":control_flow", ":variables", + #internal proto upb dep + "//third_party/py/numpy", "//tensorflow/python/autograph/utils:ag_logging", "//tensorflow/python/autograph/utils:testing", "//tensorflow/python/data/ops:dataset_ops", @@ -209,7 +213,6 @@ py_strict_test( "//tensorflow/python/ops:random_ops", "//tensorflow/python/ops/ragged:ragged_factory_ops", "//tensorflow/python/platform:client_testlib", - "//third_party/py/numpy", ], ) @@ -220,6 +223,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":exceptions", + #internal proto upb dep "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:errors", "//tensorflow/python/framework:test_lib", @@ -234,6 +238,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":logical", + #internal proto upb dep "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", @@ -248,6 +253,7 @@ py_strict_test( deps = [ ":data_structures", ":py_builtins", + #internal proto upb dep "//tensorflow/python/autograph/core:converter", "//tensorflow/python/autograph/core:function_wrappers", "//tensorflow/python/data/ops:dataset_ops", @@ -270,6 +276,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":slices", + #internal proto upb dep "//tensorflow/python/framework:constant_op", 
"//tensorflow/python/ops:list_ops", "//tensorflow/python/platform:client_testlib", @@ -283,6 +290,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":variables", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD index 949d841e00cc49..442823158b5f8e 100644 --- a/tensorflow/python/autograph/pyct/BUILD +++ b/tensorflow/python/autograph/pyct/BUILD @@ -193,6 +193,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":anno", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) @@ -209,8 +210,9 @@ py_strict_test( ":parser", ":pretty_printer", ":qual_names", - "//tensorflow/python/platform:client_testlib", "@pypi_gast//:pkg", + #internal proto upb dep + "//tensorflow/python/platform:client_testlib", ], ) @@ -221,6 +223,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":cache", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) @@ -233,8 +236,9 @@ py_strict_test( deps = [ ":cfg", ":parser", - "//tensorflow/python/platform:client_testlib", "@pypi_gast//:pkg", + #internal proto upb dep + "//tensorflow/python/platform:client_testlib", ], ) @@ -248,9 +252,10 @@ py_strict_test( ":loader", ":parser", ":pretty_printer", + "@pypi_gast//:pkg", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", "//tensorflow/python/util:tf_inspect", - "@pypi_gast//:pkg", ], ) @@ -262,6 +267,7 @@ py_strict_test( deps = [ ":error_utils", ":origin_info", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) @@ -273,6 +279,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":inspect_utils", + #internal proto upb dep "//tensorflow/python/autograph/pyct/testing:basic_definitions", "//tensorflow/python/autograph/pyct/testing:decorators", "//tensorflow/python/framework:constant_op", @@ -294,6 +301,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":naming", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) @@ -308,6 +316,7 @@ py_strict_test( ":inspect_utils", ":origin_info", ":parser", + #internal proto upb dep "//tensorflow/python/autograph/pyct/testing:basic_definitions", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/util:tf_inspect", @@ -324,8 +333,9 @@ py_strict_test( ":errors", ":parser", ":pretty_printer", - "//tensorflow/python/platform:client_testlib", "@pypi_gast//:pkg", + #internal proto upb dep + "//tensorflow/python/platform:client_testlib", ], ) @@ -336,6 +346,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":pretty_printer", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) @@ -349,6 +360,7 @@ py_strict_test( ":anno", ":parser", ":qual_names", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", ], ) @@ -363,9 +375,10 @@ py_strict_test( ":parser", ":qual_names", ":templates", - "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", "@pypi_gast//:pkg", + #internal proto upb dep + "//tensorflow/python/platform:client_testlib", ], ) @@ -379,8 +392,9 @@ py_strict_test( ":origin_info", ":parser", ":transformer", - "//tensorflow/python/platform:client_testlib", "@pypi_gast//:pkg", + #internal proto upb dep + "//tensorflow/python/platform:client_testlib", ], ) @@ -392,7 +406,8 @@ py_strict_test( deps = [ ":transformer", ":transpiler", - "//tensorflow/python/platform:client_testlib", "@pypi_gast//:pkg", + #internal proto upb dep + 
"//tensorflow/python/platform:client_testlib", ], ) diff --git a/tensorflow/python/autograph/pyct/cfg.py b/tensorflow/python/autograph/pyct/cfg.py index fd8ddf046d29e9..3c4f0ac15919e6 100644 --- a/tensorflow/python/autograph/pyct/cfg.py +++ b/tensorflow/python/autograph/pyct/cfg.py @@ -780,6 +780,11 @@ def visit_ImportFrom(self, node): def visit_Expr(self, node): self._process_basic_statement(node) + def visit_NamedExpr(self, node): + # TODO(yileiyang): Add a test case once we have a newer astunparse version. + # NamedExpr was introduced in Python 3.8 and supported in gast 0.5.1+. + self._process_basic_statement(node) + def visit_Assign(self, node): self._process_basic_statement(node) diff --git a/tensorflow/python/autograph/pyct/common_transformers/BUILD b/tensorflow/python/autograph/pyct/common_transformers/BUILD index 2be00498cf7d4d..44160a7f3f22f2 100644 --- a/tensorflow/python/autograph/pyct/common_transformers/BUILD +++ b/tensorflow/python/autograph/pyct/common_transformers/BUILD @@ -28,10 +28,11 @@ py_strict_test( tags = ["no_oss"], deps = [ ":common_transformers", + "@pypi_gast//:pkg", + #internal proto upb dep "//tensorflow/python/autograph/pyct:loader", "//tensorflow/python/autograph/pyct:parser", "//tensorflow/python/autograph/pyct:transformer", "//tensorflow/python/platform:client_testlib", - "@pypi_gast//:pkg", ], ) diff --git a/tensorflow/python/autograph/pyct/static_analysis/BUILD b/tensorflow/python/autograph/pyct/static_analysis/BUILD index 4329523b0562de..7e5011fa2d9c16 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/BUILD +++ b/tensorflow/python/autograph/pyct/static_analysis/BUILD @@ -37,6 +37,7 @@ py_strict_test( ":activity", ":reaching_definitions", ":reaching_fndefs", + #internal proto upb dep "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/autograph/pyct:cfg", "//tensorflow/python/autograph/pyct:naming", @@ -101,13 +102,14 @@ py_strict_test( deps = [ ":activity", ":annos", + "@pypi_gast//:pkg", + #internal proto upb dep "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/autograph/pyct:naming", "//tensorflow/python/autograph/pyct:parser", "//tensorflow/python/autograph/pyct:qual_names", "//tensorflow/python/autograph/pyct:transformer", "//tensorflow/python/platform:client_testlib", - "@pypi_gast//:pkg", ], ) @@ -121,6 +123,7 @@ py_strict_test( ":activity", ":liveness", ":reaching_fndefs", + #internal proto upb dep "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/autograph/pyct:cfg", "//tensorflow/python/autograph/pyct:naming", @@ -139,6 +142,7 @@ py_strict_test( deps = [ ":activity", ":reaching_definitions", + #internal proto upb dep "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/autograph/pyct:cfg", "//tensorflow/python/autograph/pyct:naming", @@ -159,6 +163,7 @@ py_strict_test( ":reaching_definitions", ":reaching_fndefs", ":type_inference", + #internal proto upb dep "//tensorflow/python/autograph/pyct:anno", "//tensorflow/python/autograph/pyct:cfg", "//tensorflow/python/autograph/pyct:qual_names", diff --git a/tensorflow/python/autograph/pyct/testing/BUILD b/tensorflow/python/autograph/pyct/testing/BUILD index 21a6775b0fb539..51d186363ebb2d 100644 --- a/tensorflow/python/autograph/pyct/testing/BUILD +++ b/tensorflow/python/autograph/pyct/testing/BUILD @@ -45,7 +45,8 @@ py_strict_test( ], deps = [ ":codegen", - "//tensorflow/python/platform:client_testlib", + #internal proto upb dep "//third_party/py/numpy", + "//tensorflow/python/platform:client_testlib", ], ) diff --git 
a/tensorflow/python/autograph/utils/BUILD b/tensorflow/python/autograph/utils/BUILD index d758c28801c315..f5aad03ed8fd8c 100644 --- a/tensorflow/python/autograph/utils/BUILD +++ b/tensorflow/python/autograph/utils/BUILD @@ -101,6 +101,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":context_managers", + #internal proto upb dep "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/ops:tensor_array_ops", @@ -115,6 +116,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":misc", + #internal proto upb dep "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:test_lib", @@ -130,6 +132,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":tensor_list", + #internal proto upb dep "//tensorflow/python/eager:context", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", @@ -148,6 +151,7 @@ py_strict_test( srcs_version = "PY3", deps = [ ":tensors", + #internal proto upb dep "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/ops:list_ops", diff --git a/tensorflow/python/checkpoint/BUILD b/tensorflow/python/checkpoint/BUILD index 0c9d5c696b4c20..11c2986f8be6a8 100644 --- a/tensorflow/python/checkpoint/BUILD +++ b/tensorflow/python/checkpoint/BUILD @@ -5,6 +5,7 @@ load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library") load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test") load( "//tensorflow/tools/test:performance.bzl", + "tf_py_benchmark_test", "tf_py_logged_benchmark", ) @@ -99,8 +100,10 @@ py_strict_library( "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:object_identity", - "//tensorflow/python/util:tf_decorator", + "//tensorflow/python/util:tf_contextlib", "//tensorflow/python/util:tf_export", + "//tensorflow/python/util:tf_inspect", + "@absl_py//absl/logging", ], ) @@ -363,7 +366,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "benchmarks_test", srcs = ["benchmarks_test.py"], deps = [ @@ -389,6 +392,7 @@ py_strict_library( srcs = ["checkpoint_options.py"], srcs_version = "PY3", deps = [ + "//tensorflow/python/checkpoint/sharding:sharding_util", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:tf_export", ], @@ -401,6 +405,8 @@ py_strict_library( deps = [ ":checkpoint_options", "//tensorflow/core:protos_all_py", + "//tensorflow/python/checkpoint/sharding:sharding_policies", + "//tensorflow/python/checkpoint/sharding:sharding_util", "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:constant_op", @@ -408,19 +414,18 @@ py_strict_library( "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:tensor", - "//tensorflow/python/framework:tensor_shape", "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/framework:tensor_util", "//tensorflow/python/ops:array_ops", "//tensorflow/python/ops:io_ops", "//tensorflow/python/ops:io_ops_gen", "//tensorflow/python/ops:string_ops", - "//tensorflow/python/ops:variables", "//tensorflow/python/saved_model/registration", "//tensorflow/python/trackable:base", "//tensorflow/python/trackable:trackable_utils", "//tensorflow/python/training/saving:saveable_object", "//tensorflow/python/training/saving:saveable_object_util", + "//tensorflow/python/types:core", 
"//tensorflow/python/util:nest", "//tensorflow/python/util:object_identity", ], @@ -446,10 +451,10 @@ cuda_py_strict_test( "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", "//tensorflow/python/module", + "//tensorflow/python/ops:io_ops_gen", "//tensorflow/python/ops:resource_variable_ops", "//tensorflow/python/platform:gfile", "//tensorflow/python/training:server_lib", - "//tensorflow/python/training/saving:saveable_object", "//tensorflow/python/training/saving:saveable_object_util", ], ) diff --git a/tensorflow/python/checkpoint/checkpoint_options.py b/tensorflow/python/checkpoint/checkpoint_options.py index 662fdcc455c4a3..7a081b80377ce9 100644 --- a/tensorflow/python/checkpoint/checkpoint_options.py +++ b/tensorflow/python/checkpoint/checkpoint_options.py @@ -17,6 +17,7 @@ import copy import inspect +from tensorflow.python.checkpoint.sharding import sharding_util from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.tf_export import tf_export @@ -45,6 +46,7 @@ class CheckpointOptions(object): "experimental_enable_async_checkpoint", "experimental_write_callbacks", "enable_async", + "experimental_sharding_callback", ) @deprecated_args( @@ -56,6 +58,7 @@ def __init__( experimental_enable_async_checkpoint=False, experimental_write_callbacks=None, enable_async=False, + experimental_sharding_callback=None, ): """Creates an object that stores options for a Checkpoint. @@ -91,6 +94,13 @@ def __init__( writing runs in the background. Async checkpoint reduces TPU device idle cycles and speeds up model training process, while memory consumption may increase. + + experimental_sharding_callback: `tf.train.experimental.ShardingCallback`. + A pre-made or custom callback that determines how checkpoints are + sharded on disk. Pre-made callback options are + `tf.train.experimental.ShardByDevicePolicy` and + `tf.train.experimental.MaxShardSizePolicy`. You may also write a custom + callback, see `tf.train.experimental.ShardingCallback`. """ self.experimental_io_device = experimental_io_device self.enable_async = experimental_enable_async_checkpoint or enable_async @@ -100,6 +110,13 @@ def __init__( for callback in experimental_write_callbacks: assert len(inspect.signature(callback).parameters) <= 1 self.experimental_write_callbacks = experimental_write_callbacks + if experimental_sharding_callback is not None: + if not isinstance( + experimental_sharding_callback, sharding_util.ShardingCallback): + raise ValueError("The experimental_sharding_callback checkpoint option" + "must be of type ShardingCallback. 
The option provided" + f"was of type {type(experimental_sharding_callback)}.") + self.experimental_sharding_callback = experimental_sharding_callback def __copy__(self): # Only `experimental_write_callbacks` needs special treatment to Ensure that diff --git a/tensorflow/python/checkpoint/functional_saver.py b/tensorflow/python/checkpoint/functional_saver.py index bd2868013ed3ef..6c918d3bd969a6 100644 --- a/tensorflow/python/checkpoint/functional_saver.py +++ b/tensorflow/python/checkpoint/functional_saver.py @@ -15,10 +15,12 @@ """Saves and restore variables inside traced @tf.functions.""" import dataclasses -from typing import Callable, Dict, List +from typing import Callable, Mapping, MutableMapping, MutableSequence, Sequence from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.checkpoint import checkpoint_options +from tensorflow.python.checkpoint.sharding import sharding_policies +from tensorflow.python.checkpoint.sharding import sharding_util from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op @@ -26,181 +28,125 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor as tensor_lib -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_io_ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import string_ops -from tensorflow.python.ops import variables from tensorflow.python.saved_model import registration from tensorflow.python.trackable import base from tensorflow.python.trackable import trackable_utils from tensorflow.python.training.saving import saveable_object from tensorflow.python.training.saving import saveable_object_util +from tensorflow.python.types import core from tensorflow.python.util import nest from tensorflow.python.util import object_identity -@dataclasses.dataclass(frozen=True) -class ShardableTensor: - """Tensor wrapper containing data necessary for sharding.""" - _tensor_save_spec: saveable_object.SaveSpec - tensor: tensor_lib.Tensor - dtype: dtypes.DType - device: device_lib.DeviceSpec - name: str - shape: tensor_shape.TensorShape - slice_spec: variables.Variable.SaveSliceInfo - checkpoint_key: str - trackable: base.Trackable - - def __hash__(self): - return hash((self.name, self.dtype, str(self.device), self.checkpoint_key)) - - -@dataclasses.dataclass(frozen=True) -class ShardingCallback: - """Checkpoint sharding callback function, along with a text description.""" - callback: Callable[ - [List[ShardableTensor], ...], - List[Dict[str, Dict[tensor_spec.TensorSpec, saveable_object.SaveSpec]]]] - description: str - - def __hash__(self): - if hasattr(self.callback, "__name__"): - callback_hash = hash((self.callback.__module__, self.callback.__name__)) - else: - callback_hash = id(self.callback) - return hash((callback_hash, self.description)) - - -class ShardByDevicePolicy(ShardingCallback): - """Policy that splits tensors into shards based on their device spec.""" - - def __init__(self): - def device_callback_impl(shardable_tensors): - """Callback to split tensors into shards based on their device spec. - - Args: - shardable_tensors: A list of ShardableTensors. - - Returns: - List of shard dicts containing tensors. 
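# --- Editor's illustration (not part of the patch) --------------------------
# A minimal usage sketch of the new `experimental_sharding_callback` option
# added to CheckpointOptions above. The pre-made policy name comes from that
# docstring (`tf.train.experimental.ShardByDevicePolicy`); its no-argument
# constructor and the export path are assumptions of this sketch.
import tensorflow as tf

ckpt = tf.train.Checkpoint(v=tf.Variable(1.0))
options = tf.train.CheckpointOptions(
    experimental_sharding_callback=tf.train.experimental.ShardByDevicePolicy())
ckpt.write("/tmp/sharded_ckpt", options=options)
# -----------------------------------------------------------------------------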
- [ {checkpoint key: {slice_spec: tensor} } ] - """ - tensors_by_device = {} - - for shardable_tensor in shardable_tensors: - tensor = shardable_tensor.tensor - checkpoint_key = shardable_tensor.checkpoint_key - slice_spec = shardable_tensor.slice_spec - device = saveable_object_util.set_cpu0(shardable_tensor.device) +RegisteredSaversDict = Mapping[ + registration.RegisteredSaver, Mapping[str, base.Trackable]] +MappedCapturesCallable = Callable[ + [core.ConcreteFunction, Sequence[tensor_lib.Tensor]], tensor_lib.Tensor] - (tensors_by_device - .setdefault(device, {}) - .setdefault(checkpoint_key, {})[slice_spec]) = tensor - return list(tensors_by_device.values()) +def _single_shard_save( + file_prefix: tensor_lib.Tensor, + shard: sharding_util.TensorSliceDict, + task: device_lib.DeviceSpec, + options: "checkpoint_options.CheckpointOptions | None" = None, +) -> ops.Operation: + """Save the saveable objects to a checkpoint with `file_prefix`. - super().__init__( - device_callback_impl, - "Split tensors into shards based on their device spec.") - - def __call__(self, shardable_tensors): - return self.callback(shardable_tensors) # pylint: disable=no-value-for-parameter - - -class _SingleDeviceSaver(object): - """Saves and restores checkpoints from the current device.""" - - __slots__ = ["_tensor_slice_dict"] - - def __init__(self, tensor_slice_dict): - """Specify a list of `SaveableObject`s to save and restore. - - Args: - tensor_slice_dict: A dict mapping checkpoint key -> slice_spec -> tensor. - """ - self._tensor_slice_dict = tensor_slice_dict - - def save(self, file_prefix, options=None): - """Save the saveable objects to a checkpoint with `file_prefix`. + Args: + file_prefix: A string or scalar string Tensor containing the prefix to + save under. + shard: Dict containing tensors. {checkpoint key: {slice_spec: tensor} } + task: The device spec task of the tensors in the shard. + options: Optional `CheckpointOptions` object. - Args: - file_prefix: A string or scalar string Tensor containing the prefix to - save under. - options: Optional `CheckpointOptions` object. - Returns: - An `Operation`, or None when executing eagerly. - """ - options = options or checkpoint_options.CheckpointOptions() - tensor_names = [] - tensors = [] - slice_specs = [] - for checkpoint_key, tensor_slices in self._tensor_slice_dict.items(): - for slice_spec, tensor in tensor_slices.items(): - if isinstance(tensor, saveable_object.SaveSpec): - tensor_value = tensor.tensor - # A tensor value of `None` indicates that this SaveableObject gets - # recorded in the object graph, but that no value is saved in the - # checkpoint. - if tensor_value is not None: - tensor_names.append(tensor.name) - tensors.append(tensor_value) - slice_specs.append(tensor.slice_spec) - else: - tensor_names.append(checkpoint_key) - tensors.append(tensor) - slice_specs.append(slice_spec) - save_device = options.experimental_io_device or ( - len(tensors) and saveable_object_util.set_cpu0(tensors[0].device)) - save_device = save_device or "cpu:0" - with ops.device(save_device): - return io_ops.save_v2(file_prefix, tensor_names, slice_specs, tensors) - - def restore(self, file_prefix, options=None): - """Restore the saveable objects from a checkpoint with `file_prefix`. + Returns: + An `Operation`, or None when executing eagerly. 
+ """ + options = options or checkpoint_options.CheckpointOptions() + + tensor_names = [] + tensors = [] + slice_specs = [] + for checkpoint_key, tensor_slices in shard.items(): + for slice_spec, tensor in tensor_slices.items(): + # A tensor value of `None` indicates that this SaveableObject gets + # recorded in the object graph, but that no value is saved in the + # checkpoint. + if tensor is not None: + # See `MultiDeviceSaver._get_shards_by_task` for an explanation on the + # wrapped properties. + name = (tensor._wrapped_name # pylint: disable=protected-access + if hasattr(tensor, "_wrapped_name") + else checkpoint_key) + spec = (tensor._wrapped_slice_spec # pylint: disable=protected-access + if hasattr(tensor, "_wrapped_slice_spec") + else slice_spec) + + tensor_names.append(name) + tensors.append(tensor) + slice_specs.append(spec) + + save_device = options.experimental_io_device or (len(tensors) and task) + with ops.device(save_device or "CPU:0"): + return io_ops.save_v2(file_prefix, tensor_names, slice_specs, tensors) + + +def _single_shard_restore( + file_prefix: tensor_lib.Tensor, + shardable_tensors: Sequence[sharding_util.ShardableTensor], + options: "checkpoint_options.CheckpointOptions | None" = None +) -> sharding_util.TensorSliceDict: + """Restore the saveable objects from a checkpoint with `file_prefix`. - Args: - file_prefix: A string or scalar string Tensor containing the prefix for - files to read from. - options: Optional `CheckpointOptions` object. + Args: + file_prefix: A string or scalar string Tensor containing the prefix for + files to read from. + shardable_tensors: A list of ShardableTensors to restore. + options: Optional `CheckpointOptions` object. - Returns: - A restored tensor dict (maps checkpoint_key -> slice_spec -> tensor). - """ - options = options or checkpoint_options.CheckpointOptions() - tensor_names = [] - tensor_dtypes = [] - slice_specs = [] - - for checkpoint_key, tensor_slices in self._tensor_slice_dict.items(): - for slice_spec, tensor in tensor_slices.items(): - tensor_dtypes.append(tensor.dtype) - if isinstance(tensor, saveable_object.SaveSpec): - slice_specs.append(tensor.slice_spec) - tensor_names.append(tensor.name) - else: - slice_specs.append(slice_spec) - tensor_names.append(checkpoint_key) - - restore_device = options.experimental_io_device or "cpu:0" - with ops.device(restore_device): - restored_tensors = io_ops.restore_v2( - file_prefix, tensor_names, slice_specs, tensor_dtypes) - - restored_tensor_dict = {} - for checkpoint_key, tensor_slices in self._tensor_slice_dict.items(): - for slice_spec in tensor_slices: - restored_tensor = restored_tensors.pop(0) - restored_tensor_dict.setdefault(checkpoint_key, {})[slice_spec] = ( - restored_tensor) - return restored_tensor_dict - - -def sharded_filename(filename_tensor, shard, num_shards): + Returns: + A restored tensor dict (maps checkpoint_key -> slice_spec -> tensor). 
+ """ + options = options or checkpoint_options.CheckpointOptions() + + tensor_names = [] + tensor_dtypes = [] + slice_specs = [] + for shardable_tensor in shardable_tensors: + if shardable_tensor._tensor_save_spec: # pylint: disable=protected-access + name = shardable_tensor._tensor_save_spec.name # pylint: disable=protected-access + spec = shardable_tensor._tensor_save_spec.slice_spec # pylint: disable=protected-access + else: + name, spec = shardable_tensor.checkpoint_key, shardable_tensor.slice_spec + tensor_names.append(name) + slice_specs.append(spec) + tensor_dtypes.append(shardable_tensor.dtype) + + restore_device = options.experimental_io_device or "cpu:0" + with ops.device(restore_device): + restored_tensors = io_ops.restore_v2( + file_prefix, tensor_names, slice_specs, tensor_dtypes) + + restored_tensor_dict = {} + for shardable_tensor in shardable_tensors: + restored_tensor = restored_tensors.pop(0) + (restored_tensor_dict + .setdefault(shardable_tensor.checkpoint_key, {} + )[shardable_tensor.slice_spec]) = restored_tensor + return restored_tensor_dict + + +def sharded_filename( + filename_tensor: tensor_lib.Tensor, + shard: int, + num_shards: tensor_lib.Tensor +) -> tensor_lib.Tensor: """Append sharding information to a filename. Args: @@ -214,15 +160,22 @@ def sharded_filename(filename_tensor, shard, num_shards): return gen_io_ops.sharded_filename(filename_tensor, shard, num_shards) -def registered_saver_filename(filename_tensor, saver_name): +def registered_saver_filename( + filename_tensor: tensor_lib.Tensor, + saver_name: registration.RegisteredSaver +) -> tensor_lib.Tensor: return string_ops.string_join( [filename_tensor, constant_op.constant(f"-{saver_name}")]) -def _get_mapped_registered_save_fn(fn, trackables, call_with_mapped_captures): +def _get_mapped_registered_save_fn( + fn: Callable[..., tensor_lib.Tensor], + trackables: Sequence[base.Trackable], + call_with_mapped_captures: MappedCapturesCallable +) -> Callable[[tensor_lib.Tensor], MappedCapturesCallable]: """Converts the function to a python or tf.function with a single file arg.""" - def save_fn(file_prefix): + def save_fn(file_prefix: tensor_lib.Tensor) -> tensor_lib.Tensor: return fn(trackables=trackables, file_prefix=file_prefix) if call_with_mapped_captures is None: return save_fn @@ -231,17 +184,21 @@ def save_fn(file_prefix): concrete = tf_fn.get_concrete_function( file_prefix=tensor_spec.TensorSpec(shape=(), dtype=dtypes.string)) - def save_fn_with_replaced_captures(file_prefix): + def save_fn_with_replaced_captures( + file_prefix: tensor_lib.Tensor) -> tensor_lib.Tensor: return call_with_mapped_captures(concrete, [file_prefix]) return save_fn_with_replaced_captures -def _get_mapped_registered_restore_fn(fn, trackables, - call_with_mapped_captures): +def _get_mapped_registered_restore_fn( + fn: Callable[..., tensor_lib.Tensor], + trackables: Sequence[base.Trackable], + call_with_mapped_captures: MappedCapturesCallable +) -> Callable[..., tensor_lib.Tensor]: """Converts the function to a python or tf.function with a single file arg.""" - def restore_fn(merged_prefix): + def restore_fn(merged_prefix: tensor_lib.Tensor) -> tensor_lib.Tensor: return fn(trackables=trackables, merged_prefix=merged_prefix) if call_with_mapped_captures is None: return restore_fn @@ -250,7 +207,8 @@ def restore_fn(merged_prefix): concrete = tf_fn.get_concrete_function( merged_prefix=tensor_spec.TensorSpec(shape=(), dtype=dtypes.string)) - def restore_fn_with_replaced_captures(merged_prefix): + def 
restore_fn_with_replaced_captures( + merged_prefix: tensor_lib.Tensor) -> tensor_lib.Tensor: return call_with_mapped_captures(concrete, [merged_prefix]) return restore_fn_with_replaced_captures @@ -259,7 +217,7 @@ def restore_fn_with_replaced_captures(merged_prefix): _restore_noop = lambda *args, **kwargs: None -class MultiDeviceSaver(object): +class MultiDeviceSaver: """Saves checkpoints directly from multiple devices. Note that this is a low-level utility which stores Tensors in the keys @@ -267,10 +225,12 @@ class MultiDeviceSaver(object): checkpointing are built on top of it. """ - def __init__(self, - serialized_tensors, - registered_savers=None, - call_with_mapped_captures=None): + def __init__( + self, + serialized_tensors: Mapping[ + base.Trackable, sharding_util.TensorSliceDict], + registered_savers: "RegisteredSaversDict | None" = None, + call_with_mapped_captures: "MappedCapturesCallable | None" = None): """Specify a list of `SaveableObject`s to save and restore. Args: @@ -284,24 +244,37 @@ def __init__(self, Trackable in the checkpoint. call_with_mapped_captures: TODO """ + self._shardable_tensors: MutableSequence[sharding_util.ShardableTensor] = [] # Keep these two data structures so that we can map restored tensors to # the Trackable restore functions. - self._keys_to_restore_fn = {} - self._restore_fn_to_keys = {} - - # Extract serialized tensors and separate by device. - tensors_by_device = {} # device -> checkpoint key -> (slice_spec ->) tensor - + self._keys_to_restore_fn: MutableMapping[ + sharding_util.TensorSlice, + Callable[Mapping[str, tensor_lib.Tensor]]] = {} + self._restore_fn_to_keys: MutableMapping[ + Callable[Mapping[str, tensor_lib.Tensor]], + MutableSequence[sharding_util.TensorSlice]] = {} + + unique_tasks = set() for obj, tensor_dict in serialized_tensors.items(): restore_fn = _restore_noop if obj is None else obj._restore_from_tensors - # Divide tensor_dict by device. - for checkpoint_key, maybe_tensor in tensor_dict.items(): - if not isinstance(maybe_tensor, dict): + # Divide tensor_dict by task. + for checkpoint_key, tensor_slice_dict in tensor_dict.items(): + if not isinstance(tensor_slice_dict, dict): # Make sure that maybe_tensor is structured as {slice_spec -> tensor}. 
- maybe_tensor = {"": maybe_tensor} + tensor_slice_dict = {"": tensor_slice_dict} + + for slice_spec, tensor_save_spec in tensor_slice_dict.items(): + tensor_value = None + if not isinstance(tensor_save_spec, saveable_object.SaveSpec): + tensor_value = tensor_save_spec + tensor_save_spec = saveable_object.SaveSpec( + tensor=tensor_value, + slice_spec=slice_spec, + name=checkpoint_key, + dtype=tensor_save_spec.dtype, + device=tensor_save_spec.device) - for slice_spec, tensor in maybe_tensor.items(): if (checkpoint_key, slice_spec) in self._keys_to_restore_fn: raise ValueError( "Recieved multiple tensors with the same checkpoint key and " @@ -312,13 +285,24 @@ def __init__(self, self._restore_fn_to_keys.setdefault(restore_fn, []).append( (checkpoint_key, slice_spec)) - host_device = saveable_object_util.set_cpu0(tensor.device) - (tensors_by_device - .setdefault(host_device, {}) - .setdefault(checkpoint_key, {})[slice_spec]) = tensor - self._single_device_savers = { - device: _SingleDeviceSaver(tensor_slice_dict) - for device, tensor_slice_dict in tensors_by_device.items()} + device = (device_lib.DeviceSpec.from_string(tensor_save_spec.device) + if isinstance(tensor_save_spec.device, str) + else tensor_save_spec.device) + self._shardable_tensors.append( + sharding_util.ShardableTensor( + _tensor_save_spec=tensor_save_spec, + tensor=tensor_value, + dtype=tensor_save_spec.dtype, + device=device, + name=tensor_save_spec.name, + shape=None, + slice_spec=slice_spec.strip(), + checkpoint_key=checkpoint_key, + trackable=obj)) + unique_tasks.add( + saveable_object_util.set_cpu0(device.to_string())) + + self._num_unique_tasks = len(unique_tasks) self._registered_savers = {} if registered_savers: @@ -332,8 +316,13 @@ def __init__(self, self._registered_savers[registered_name] = (save_fn, restore_fn) @classmethod - def from_saveables(cls, saveables, registered_savers=None, - call_with_mapped_captures=None): + def from_saveables( + cls, + saveables: Sequence[base.Trackable], + registered_savers: "RegisteredSaversDict | None" = None, + call_with_mapped_captures: "MappedCapturesCallable | None" = None + ) -> "MultiDeviceSaver": + """Constructs a MultiDeviceSaver from a list of `SaveableObject`s.""" serialized_tensors = object_identity.ObjectIdentityDictionary() for saveable in saveables: trackable = saveable_object_util.SaveableCompatibilityConverter( @@ -341,7 +330,7 @@ def from_saveables(cls, saveables, registered_savers=None, serialized_tensors[trackable] = trackable._serialize_to_tensors() # pylint: disable=protected-access return cls(serialized_tensors, registered_savers, call_with_mapped_captures) - def to_proto(self): + def to_proto(self) -> saver_pb2.SaverDef: """Serializes to a SaverDef referencing the current graph.""" filename_tensor = array_ops.placeholder( shape=[], dtype=dtypes.string, name="saver_filename") @@ -356,7 +345,7 @@ def to_proto(self): @def_function.function( input_signature=(tensor_spec.TensorSpec(shape=(), dtype=dtypes.string),), autograph=False) - def _traced_save(self, file_prefix): + def _traced_save(self, file_prefix: tensor_lib.Tensor) -> tensor_lib.Tensor: save_op = self.save(file_prefix) with ops.device("cpu:0"): with ops.control_dependencies([save_op]): @@ -365,13 +354,72 @@ def _traced_save(self, file_prefix): @def_function.function( input_signature=(tensor_spec.TensorSpec(shape=(), dtype=dtypes.string),), autograph=False) - def _traced_restore(self, file_prefix): + def _traced_restore( + self, file_prefix: tensor_lib.Tensor) -> tensor_lib.Tensor: restore_ops = 
self.restore(file_prefix) with ops.device("cpu:0"): with ops.control_dependencies(restore_ops.values()): return array_ops.identity(file_prefix) - def save(self, file_prefix, options=None): + def _get_shards_by_task( + self, + sharding_callback: sharding_util.ShardingCallback + ) -> Sequence[sharding_util.TensorSliceDict]: + """Calls the sharding callback with shardable_tensors. + + Args: + sharding_callback: ShardingCallback. The callback function wrapper that + splits shardable_tensors into shards. + + Returns: + A list of shards. + """ + shardable_tensors_by_task = {} + for shardable_tensor in self._shardable_tensors: + tensor_val = shardable_tensor.tensor + tensor_shape = shardable_tensor.shape + save_spec = shardable_tensor._tensor_save_spec # pylint: disable=protected-access + with ops.device(shardable_tensor.device): + save_spec_tensor = save_spec.tensor + + if tensor_val is None and save_spec_tensor is None: + # A tensor value of `None` indicates that this SaveableObject gets + # recorded in the object graph, but that no value is saved in the + # checkpoint. + continue + elif save_spec_tensor is not None: + # Pull the tensor value from _tensor_save_spec. + tensor_val = save_spec_tensor + tensor_shape = save_spec_tensor.shape + + # Propagate the save spec name and/or slice spec when they are tensors. + # This makes sure properties like `layout` for dtensor names/slice specs + # are preserved during sharding. + if isinstance(save_spec.name, tensor_lib.Tensor): + tensor_val._wrapped_name = save_spec.name # pylint: disable=protected-access + if isinstance(shardable_tensor.slice_spec, tensor_lib.Tensor): + tensor_val._wrapped_slice_spec = save_spec.slice_spec # pylint: disable=protected-access + + task = device_lib.DeviceSpec.from_string( + saveable_object_util.set_cpu0(shardable_tensor.device.to_string())) + shardable_tensors_by_task.setdefault(task, []).append(dataclasses.replace( + shardable_tensor, + tensor=tensor_val, + shape=tensor_shape + )) + + sharding_callback = ( + sharding_callback or sharding_policies.ShardByTaskPolicy()) + shards_by_task = [ + (task, sharding_callback(shardable_tensors)) + for task, shardable_tensors in shardable_tensors_by_task.items()] + return shards_by_task + + def save( + self, + file_prefix: tensor_lib.Tensor, + options: "checkpoint_options.CheckpointOptions | None" = None + ) -> ops.Operation: """Save the saveable objects to a checkpoint with `file_prefix`. Args: @@ -423,7 +471,7 @@ def save(self, file_prefix, options=None): for saver_name in self._registered_savers } - def save_fn(): + def save_fn() -> ops.Operation: saved_prefixes = [] # Save with the registered savers. These run before default savers due to # the API contract. @@ -439,31 +487,31 @@ def save_fn(): f"string type tensors. Got {maybe_saved_prefixes}.") saved_prefixes.extend(flattened_saved_prefixes) - # (Default saver) Save with single device savers. 
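# --- Editor's illustration (not part of the patch) --------------------------
# A sketch of the data shape a sharding callback consumes and returns,
# mirroring the removed ShardByDevicePolicy logic above: the callback receives
# a list of ShardableTensors and returns a list of shards, each a dict of
# {checkpoint_key: {slice_spec: tensor}}. A real callback would subclass the
# ShardingCallback defined in checkpoint/sharding/sharding_util.py in this
# change; this plain function only illustrates the grouping step.
def group_tensors_by_device(shardable_tensors):
  shards_by_device = {}
  for st in shardable_tensors:
    (shards_by_device
     .setdefault(st.device, {})
     .setdefault(st.checkpoint_key, {}))[st.slice_spec] = st.tensor
  return list(shards_by_device.values())
# -----------------------------------------------------------------------------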
- num_shards = len(self._single_device_savers) + shards_by_task = self._get_shards_by_task( + options.experimental_sharding_callback) + num_shards_tensor = constant_op.constant( + sum([len(shards) for _, shards in shards_by_task]), name="num_shards") sharded_saves = [] - num_shards_tensor = constant_op.constant(num_shards, name="num_shards") - last_device = None - for shard, (device, saver) in enumerate( - sorted(self._single_device_savers.items())): - last_device = device - with ops.device(saveable_object_util.set_cpu0(device)): - shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, - num_shards_tensor) - saved_prefixes.append(shard_prefix) - with ops.device(device): - # _SingleDeviceSaver will use the CPU device when necessary, but - # initial read operations should be placed on the SaveableObject's - # device. - sharded_saves.append(saver.save(shard_prefix, options)) + + shard_idx = 0 + for task, shards in shards_by_task: + for shard in shards: + with ops.device(task): + shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard_idx, + num_shards_tensor) + shard_idx += 1 + saved_prefixes.append(shard_prefix) + sharded_saves.append( + _single_shard_save(shard_prefix, shard, task, options)) with ops.control_dependencies(sharded_saves): # Merge on the io_device if specified, otherwise co-locates the merge op # with the last device used. - merge_device = ( + tensor_device_spec = self._shardable_tensors[-1].device + merge_device_spec = ( options.experimental_io_device or - saveable_object_util.set_cpu0(last_device)) - with ops.device(merge_device): + saveable_object_util.set_cpu0(tensor_device_spec.to_string())) + with ops.device(merge_device_spec): # V2 format write path consists of a metadata merge step. Once # merged, attempts to delete the temporary directory, # "_temp". @@ -471,19 +519,23 @@ def save_fn(): saved_prefixes, file_prefix, delete_old_dirs=True) # Since this will causes a function re-trace on each save, limit this to the - # cases where it is needed: eager and when there are multiple tasks/single - # device savers. Note that the retrace is needed to ensure we pickup the - # latest values of options like experimental_io_device. - if context.executing_eagerly() and len(self._single_device_savers) > 1: + # cases where it is needed: eager and when there are multiple tasks. Note + # that the retrace is needed to ensure we pickup the latest values of + # options like experimental_io_device. + if context.executing_eagerly() and self._num_unique_tasks > 1: # Explicitly place the identity op on the first device. @def_function.function(jit_compile=False) - def tf_function_save(): + def tf_function_save() -> None: save_fn() tf_function_save() else: return save_fn() - def restore(self, file_prefix, options=None): + def restore( + self, + file_prefix: tensor_lib.Tensor, + options: "checkpoint_options.CheckpointOptions | None" = None + ) -> Mapping[str, ops.Operation]: """Restore the saveable objects from a checkpoint with `file_prefix`. Args: @@ -498,18 +550,17 @@ def restore(self, file_prefix, options=None): """ options = options or checkpoint_options.CheckpointOptions() - def restore_fn(): + def restore_fn() -> Mapping[str, ops.Operation]: restore_fn_inputs = {} restore_fn_input_count = { fn: len(keys) for fn, keys in self._restore_fn_to_keys.items()} restore_ops = {} - # Sort by device name to avoid propagating non-deterministic dictionary - # ordering in some Python versions. 
- for device, saver in sorted(self._single_device_savers.items()): - with ops.device(device): + if self._shardable_tensors: + with ops.device("CPU:0"): # Load values from checkpoint - restored_tensor_dict = saver.restore(file_prefix, options) + restored_tensor_dict = _single_shard_restore( + file_prefix, self._shardable_tensors, options) # Map restored tensors to the corresponding restore_fn, and see if all # inputs have all been loaded. Call `restore_fn` if that is the case. @@ -550,13 +601,12 @@ def restore_fn(): return restore_ops has_custom_device_saver = any([ - context.is_custom_device(d) for d in self._single_device_savers.keys() - ]) + context.is_custom_device(st.device.to_string()) + for st in self._shardable_tensors]) # Since this will cause a function re-trace on each restore, limit this to - # cases where it is needed: eager and when there are multiple tasks/single - # device savers or any single device saver is a custom device. Note that the - # retrace is needed to ensure we pickup the latest values of options like - # experimental_io_device. + # cases where it is needed: eager and when there are multiple tasks or any + # device_spec is a custom device. Note that the retrace is needed to ensure + # we pickup the latest values of options like experimental_io_device. # # We run in a function when there is a custom device saver because custom # devices, such as DTensor, usually do a sharded save and restore. @@ -564,10 +614,10 @@ def restore_fn(): # of variables we are restoring to. In practice, this means that custom # devices need the AssignVariableOps along with the Restore op within the # same graph to infer shapes and shard specs for Restore op. - if context.executing_eagerly() and (len(self._single_device_savers) > 1 or + if context.executing_eagerly() and (self._num_unique_tasks > 1 or has_custom_device_saver): @def_function.function(jit_compile=False, autograph=False) - def tf_function_restore(): + def tf_function_restore() -> Mapping[str, ops.Operation]: restore_fn() return {} diff --git a/tensorflow/python/checkpoint/functional_saver_test.py b/tensorflow/python/checkpoint/functional_saver_test.py index 3bac7428f2e030..954f8ed1c399b4 100644 --- a/tensorflow/python/checkpoint/functional_saver_test.py +++ b/tensorflow/python/checkpoint/functional_saver_test.py @@ -15,6 +15,7 @@ """Tests for the functional saver.""" import os +import time from tensorflow.python.checkpoint import checkpoint from tensorflow.python.checkpoint import checkpoint_options @@ -29,13 +30,12 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.module import module +from tensorflow.python.ops import gen_io_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import gfile from tensorflow.python.training import server_lib -from tensorflow.python.training.saving import saveable_object from tensorflow.python.training.saving import saveable_object_util - LOCALHOST = "/job:localhost/replica:0/task:0/device:CPU:0" @@ -53,35 +53,22 @@ def setUp(self): self.local_options = checkpoint_options.CheckpointOptions( experimental_io_device=LOCALHOST) - def _get_shardable_tensors(self, serialized_tensors): - shardable_tensors = [] - for obj, tensor_dict in serialized_tensors.items(): - # Divide tensor_dict by device. - for checkpoint_key, tensor_slice_dict in tensor_dict.items(): - if not isinstance(tensor_slice_dict, dict): - # Make sure that maybe_tensor is structured as {slice_spec -> tensor}. 
- tensor_slice_dict = {"": tensor_slice_dict} - for slice_spec, tensor_save_spec in tensor_slice_dict.items(): - if not isinstance(tensor_save_spec, saveable_object.SaveSpec): - tensor_save_spec = saveable_object.SaveSpec( - tensor=tensor_save_spec, - slice_spec=slice_spec, - name=checkpoint_key, - dtype=tensor_save_spec.dtype, - device=tensor_save_spec.device) - save_spec_tensor = tensor_save_spec.tensor - shardable_tensors.append( - functional_saver.ShardableTensor( - _tensor_save_spec=tensor_save_spec, - tensor=save_spec_tensor, - dtype=tensor_save_spec.dtype, - device=tensor_save_spec.device, - name=tensor_save_spec.name, - shape=save_spec_tensor.shape, - slice_spec=slice_spec, - checkpoint_key=checkpoint_key, - trackable=obj)) - return shardable_tensors + def _get_tensors_by_task(self, root): + serialized_tensors, _, _, _ = ( + checkpoint.TrackableSaver(graph_view.ObjectGraphView(root)) + ._gather_serialized_tensors(None)) + + tensors_by_task = {} + for tensor_dict in serialized_tensors.values(): + for checkpoint_key, maybe_tensor in tensor_dict.items(): + if not isinstance(maybe_tensor, dict): + maybe_tensor = {"": maybe_tensor} + for slice_spec, tensor in maybe_tensor.items(): + tensor_task = saveable_object_util.set_cpu0(tensor.device) + (tensors_by_task + .setdefault(tensor_task, {}) + .setdefault(checkpoint_key, {})[slice_spec]) = tensor + return tensors_by_task @test_util.run_in_graph_and_eager_modes def test_resource_variable(self): @@ -220,40 +207,49 @@ def test_checkpoint_multi_device_using_localhost(self): if op.type in ("SaveV2", "RestoreV2", "MergeV2Checkpoints"): self.assertEqual(LOCALHOST, op.device) - def test_ShardByDevicePolicy(self): + def test_single_task_save_singlehost_multidevice(self): root = module.Module() with ops.device("cpu:0"): - v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + v0 = resource_variable_ops.ResourceVariable(0.) with ops.device("cpu:1"): - v1 = resource_variable_ops.ResourceVariable(1.0, name="v1") + v1 = resource_variable_ops.ResourceVariable(1.) with ops.device("cpu:2"): - v2 = resource_variable_ops.ResourceVariable(2.0, name="v2") + v2 = resource_variable_ops.ResourceVariable(2.) 
root.v0 = v0 root.v1 = v1 root.v2 = v2 - serialized_tensors, _, _, _ = ( - checkpoint.TrackableSaver(graph_view.ObjectGraphView(root)) - ._gather_serialized_tensors(None)) - shardable_tensors = self._get_shardable_tensors(serialized_tensors) - callback = functional_saver.ShardByDevicePolicy() - shards = callback(shardable_tensors) + tensors_by_task = self._get_tensors_by_task(root) + var_names = [ + "v0/.ATTRIBUTES/VARIABLE_VALUE", + "v1/.ATTRIBUTES/VARIABLE_VALUE", + "v2/.ATTRIBUTES/VARIABLE_VALUE" + ] + vars_numpy = [v0.numpy(), v1.numpy(), v2.numpy()] + tmp_dir = self.get_temp_dir() + + for device in ["cpu:0", "cpu:1", "cpu:2"]: + for shard, (_, tensor_slice_dict) in enumerate( + sorted(tensors_by_task.items())[1:]): + with ops.device(device): + shard_prefix = gen_io_ops.sharded_filename( + os.path.join(tmp_dir, str(shard)), shard, 3) + functional_saver._single_task_save( + shard_prefix, tensor_slice_dict) - self.assertAllEqual( - [list(shard.keys()) for shard in shards], - [[ - "v0/.ATTRIBUTES/VARIABLE_VALUE", - "v1/.ATTRIBUTES/VARIABLE_VALUE", - "v2/.ATTRIBUTES/VARIABLE_VALUE", - "_CHECKPOINTABLE_OBJECT_GRAPH" - ]]) + start_time = time.time() + max_save_time = start_time + 5 # seconds + while not (gfile.ListDirectory(tmp_dir) or time.time() > max_save_time): + pass # eager execution is lovely + self.assertNotEmpty(gfile.ListDirectory(tmp_dir)) - self.assertEqual(shards[0]["v0/.ATTRIBUTES/VARIABLE_VALUE"][""].numpy(), - v0.numpy()) - self.assertEqual(shards[0]["v1/.ATTRIBUTES/VARIABLE_VALUE"][""].numpy(), - v1.numpy()) - self.assertEqual(shards[0]["v2/.ATTRIBUTES/VARIABLE_VALUE"][""].numpy(), - v2.numpy()) + with ops.device(device): + restored_dict = functional_saver._single_task_restore( + shard_prefix, tensor_slice_dict) + self.evaluate(restored_dict) + self.assertEqual( + restored_dict[var_names[shard]][""].numpy(), + vars_numpy[shard]) if __name__ == "__main__": diff --git a/tensorflow/python/checkpoint/sharding/BUILD b/tensorflow/python/checkpoint/sharding/BUILD new file mode 100644 index 00000000000000..412f8c4f12d050 --- /dev/null +++ b/tensorflow/python/checkpoint/sharding/BUILD @@ -0,0 +1,103 @@ +# Description: +# Utilities for sharding object-based checkpoints. 
+ +load("//tensorflow:strict.default.bzl", "py_strict_library") +load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow:internal", + ], + licenses = ["notice"], +) + +py_strict_library( + name = "sharding_policies", + srcs = ["sharding_policies.py"], + srcs_version = "PY3", + deps = [ + ":sharding_util", + "//tensorflow/python/eager:context", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:string_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/trackable:base", + "//tensorflow/python/util:tf_export", + "@absl_py//absl/logging", + ], +) + +tf_py_strict_test( + name = "sharding_policies_test", + srcs = ["sharding_policies_test.py"], + srcs_version = "PY3", + deps = [ + ":sharding_policies", + ":sharding_util", + "//tensorflow/python/checkpoint", + "//tensorflow/python/checkpoint:checkpoint_options", + "//tensorflow/python/checkpoint:graph_view", + "//tensorflow/python/eager:remote", + "//tensorflow/python/eager:test", + "//tensorflow/python/framework:device", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/module", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:gfile", + "//tensorflow/python/training:server_lib", + "//tensorflow/python/training/saving:saveable_object", + "//tensorflow/python/training/saving:saveable_object_util", + "@absl_py//absl/logging", + ], +) + +py_strict_library( + name = "sharding_util", + srcs = ["sharding_util.py"], + srcs_version = "PY3", + deps = [ + "//tensorflow/python/framework:device", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:tensor", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/framework:tensor_spec", + "//tensorflow/python/ops:variables", + "//tensorflow/python/trackable:base", + "//tensorflow/python/training/saving:saveable_object", + "//tensorflow/python/util:tf_export", + "@absl_py//absl/logging", + ], +) + +tf_py_strict_test( + name = "sharding_util_test", + srcs = ["sharding_util_test.py"], + srcs_version = "PY3", + deps = [ + ":sharding_policies", + ":sharding_util", + "//tensorflow/python/checkpoint", + "//tensorflow/python/checkpoint:graph_view", + "//tensorflow/python/eager:remote", + "//tensorflow/python/eager:test", + "//tensorflow/python/framework:device", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor", + "//tensorflow/python/module", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/training:server_lib", + "//tensorflow/python/training/saving:saveable_object", + "//tensorflow/python/training/saving:saveable_object_util", + ], +) diff --git a/tensorflow/python/checkpoint/sharding/sharding_policies.py b/tensorflow/python/checkpoint/sharding/sharding_policies.py new file mode 100644 index 00000000000000..5ee731fd96d979 --- /dev/null +++ b/tensorflow/python/checkpoint/sharding/sharding_policies.py @@ -0,0 +1,322 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Checkpoint policies that determine how tensors are split into shards.""" + +import math +from typing import MutableSequence, Sequence + +from absl import logging + +from tensorflow.python.checkpoint.sharding import sharding_util +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor as tensor_lib +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import string_ops +from tensorflow.python.ops import variables +from tensorflow.python.trackable import base +from tensorflow.python.util import tf_export + + +@tf_export.tf_export("train.experimental.ShardByTaskPolicy") +class ShardByTaskPolicy(sharding_util.ShardingCallback): + """Policy that splits tensors into shards based on their device spec task.""" + + @property + def description(self) -> str: + return "Split tensors into shards based on their device spec task." + + def __call__( + self, + shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + """Callback to split tensors into shards based on their device spec task. + + Args: + shardable_tensors: A list of ShardableTensors. + + Returns: + List of shard dicts containing tensors. + [ {checkpoint key: {slice_spec: tensor} } ] + """ + tensors_by_task = {} + + for shardable_tensor in shardable_tensors: + tensor = shardable_tensor.tensor + checkpoint_key = shardable_tensor.checkpoint_key + slice_spec = shardable_tensor.slice_spec + + (tensors_by_task + .setdefault(checkpoint_key, {})[slice_spec]) = tensor + + return [tensors_by_task] + + +_PartitionAxisAndSize = tuple[int, int] +_OffsetAndShape = tuple[Sequence[int], Sequence[int]] + + +@tf_export.tf_export("train.experimental.MaxShardSizePolicy") +class MaxShardSizePolicy(sharding_util.ShardingCallback): + """Policy that splits tensors into shards with a max shard size. + + Shards may exceed the max shard size if they contain 1. a single scalar/string + tensor that could not be sliced and exceeds the max shard size or 2. the + checkpoint object graph, whose size cannot be calculated when saving. + """ + + def __init__(self, max_shard_size: int): + self.max_shard_size = max_shard_size + + @property + def description(self) -> str: + return "Split tensors into shards with a max shard size." + + def _get_next_partition( + self, + shard_size_remaining: int, + shape: tensor_shape.TensorShape, + dtype_size: int, + num_elems: int + ) -> _PartitionAxisAndSize: + """Gets tensor partition with size closest to shard_size_remaining. + + Args: + shard_size_remaining: Size in bytes of the space remaining in the shard. + shape: Shape of the working tensor to partition in the remaining + shard space. + dtype_size: Size in bytes of the dtype of the working tensor. 
+ num_elems: Number of elements in the working tensor. + + Returns: + A tuple containing the axis of the next partition and that partition size. + """ + if shape.rank is None or shape.rank == 0: + return 0, math.inf + + # Find axis with minimum partitions. (aka axis with maximum partition size) + # (max partition size is as close as possible to the shard_size_remaining) + bytes_per_slice = num_elems // shape.dims[0].value * dtype_size + slices_per_shard = max( + 1, math.floor(shard_size_remaining / bytes_per_slice)) + min_parts = math.ceil(shape.dims[0].value / slices_per_shard) + min_axis = 0 + for axis in range(1, shape.rank): + bytes_per_slice = num_elems // shape.dims[axis].value * dtype_size + slices_per_shard = max( + 1, math.floor(shard_size_remaining / bytes_per_slice)) + axis_parts = math.ceil(shape.dims[axis].value / slices_per_shard) + partition_size = num_elems * dtype_size / axis_parts + if (axis_parts < min_parts and + partition_size < shard_size_remaining): + min_axis, min_parts = axis, int(axis_parts) + return min_axis, math.ceil(int(shape[min_axis]) / min_parts) + + def _add_partition( + self, + root_shardable_tensor: sharding_util.ShardableTensor, + dtype_size: int, + working_tensor_offset: Sequence[int], + part_axis_and_size: _PartitionAxisAndSize, + shard_size_remaining: int, + max_shard_size: int, + tensors_by_shard: MutableSequence[sharding_util.TensorSliceDict], + large_scalars: MutableSequence[sharding_util.TensorSliceDict], + ) -> tuple[tensor_lib.Tensor, _OffsetAndShape]: + """Adds the tensor partition to the shard, if possible. + + Args: + root_shardable_tensor: The full tensor being partitioned. + dtype_size: Size in bytes of the dtype of the working tensor. + working_tensor_offset: The offset of the working tensor in the full + tensor. + part_axis_and_size: A tuple containing the axis of the partition and that + partition size. + shard_size_remaining: Size in bytes of the space remaining in the shard. + max_shard_size: Max size in bytes allowed for a checkpoint shard. + tensors_by_shard: List of shard dicts containing tensors. + [ {checkpoint key: {slice_spec: tensor} } ] + large_scalars: List of shard dicts containing scalars too large to fit in + the max_shard_size. [ {checkpoint key: {slice_spec: tensor} } ] + + Returns: + A tuple containing the size of the slice that was added to the shard and + the offset & shape of the remaining portion of the tensor. + """ + root_tensor = root_shardable_tensor.tensor + root_tensor_shape = root_shardable_tensor.shape + checkpoint_key = root_shardable_tensor.checkpoint_key + + if root_tensor_shape.rank is None or root_tensor_shape.rank == 0: + return None, (None, None) + + min_axis, part_size = part_axis_and_size + + # Add what we can to the current shard. + slice_offset = working_tensor_offset + slice_shape = [root_tensor_shape[i] - slice_offset[i] + for i in range(root_tensor_shape.rank)] + slice_shape[min_axis] = part_size + slice_size_in_bytes = int(math.prod(slice_shape)) * dtype_size + with ops.device(root_shardable_tensor.device): + tensor_slice = array_ops.slice( + root_tensor, begin=slice_offset, size=slice_shape) + slice_spec = variables.Variable.SaveSliceInfo( + full_name=checkpoint_key, + full_shape=root_tensor_shape, + var_offset=slice_offset, + var_shape=slice_shape).spec.strip() + remaining_size = shard_size_remaining + if slice_size_in_bytes > max_shard_size: + logging.warning("Slice %s of tensor %s is a scalar of size %s bytes and " + "cannot be partitioned into a shard of max shard size %s " + "bytes. 
It will be added as an individual shard that " + "exceeds the max shard size.", slice_spec, checkpoint_key, + slice_size_in_bytes, max_shard_size) + large_scalars.append({checkpoint_key: {slice_spec: tensor_slice}}) + elif slice_size_in_bytes > shard_size_remaining: + # Smallest partition can't fit in the remaining shard space. Start fresh + # with a new shard. + return None, (None, None) + else: + if not tensors_by_shard or shard_size_remaining < 1: + tensors_by_shard.append({}) + remaining_size = max_shard_size + (tensors_by_shard[-1] + .setdefault(checkpoint_key, {})[slice_spec]) = tensor_slice + remaining_size -= slice_size_in_bytes + + # Get remaining portion of tensor to add to the next shard(s). + slice_offset[min_axis] += part_size + slice_shape = [root_tensor_shape[i] - slice_offset[i] + for i in range(root_tensor_shape.rank)] + + return (remaining_size, (slice_offset, slice_shape)) + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + """Callback to split tensors into shards with a max shard size. + + Args: + shardable_tensors: A list of ShardableTensors. + + Returns: + List of shard dicts containing tensors. + [ {checkpoint key: {slice_spec: tensor} } ] + """ + tensors_by_shard = [] + large_scalars = [] + + shard_size_remaining = self.max_shard_size + for shardable_tensor in shardable_tensors: + root_tensor = shardable_tensor.tensor + root_shape = shardable_tensor.shape + dtype = shardable_tensor.dtype + checkpoint_key = shardable_tensor.checkpoint_key + + dtype_size = dtypes.as_dtype(dtype).size + total_size = root_shape.num_elements() * dtype_size # in bytes + + # Calculate string tensor sizes. + if checkpoint_key == base.OBJECT_GRAPH_PROTO_KEY: + # In graph mode, the object graph is populated using feed_additions when + # the session is run. So, we can't calculate the size here. Fortunately, + # the serialized object graph string will never be that big, so we just + # place it in the current shard without worrying about its size. + total_size = dtype_size = 0 + elif dtype == dtypes.string: + if not context.executing_eagerly(): + with ops.device(shardable_tensor.device): + root_tensor = ops.get_default_session().run(root_tensor) + + if root_shape.rank is None or root_shape.rank == 0: + sizes = [string_ops.string_length(root_tensor, unit="BYTE")] + else: + sizes = [string_ops.string_length(elem, unit="BYTE") + for elem in root_tensor] + + if context.executing_eagerly(): + sizes = [size.numpy() for size in sizes] + else: + with ops.device(shardable_tensor.device): + sizes = ops.get_default_session().run(sizes) + + total_size = sum(sizes) + dtype_size = max(sizes) + + if (total_size > self.max_shard_size and + (root_shape.rank is None or root_shape.rank == 0)): + logging.warning("Tensor %s is a scalar of size %s bytes and cannot be " + "partitioned into a shard of max shard size %s bytes. " + "It will be added as an individual shard that exceeds " + "the max shard size.", + checkpoint_key, total_size, self.max_shard_size) + large_scalars.append( + {checkpoint_key: {shardable_tensor.slice_spec: root_tensor}}) + continue + + # Partition tensor and add partitions to shards. 
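Before the partitioning loop, it may help to see the slice-size arithmetic from `_get_next_partition` in isolation. The following standalone sketch (hypothetical helper name, single-axis case only) reproduces the split asserted for the float32 `[4]` variable in the 1-D `MaxShardSizePolicy` test later in this diff:

```
import math

def single_axis_partition_size(shard_bytes_remaining, dim_size, num_elems,
                               dtype_size):
  # Mirrors the per-axis arithmetic in _get_next_partition.
  bytes_per_slice = num_elems // dim_size * dtype_size
  slices_per_shard = max(1, math.floor(shard_bytes_remaining / bytes_per_slice))
  num_partitions = math.ceil(dim_size / slices_per_shard)
  return math.ceil(dim_size / num_partitions)

# A float32 vector of shape [4] is 16 bytes; with an 8-byte shard budget, each
# 4-byte element slice fits twice per shard, so the partition size is 2
# elements -- i.e. the [0., 1.] / [2., 3.] split asserted in the tests.
print(single_axis_partition_size(8, dim_size=4, num_elems=4, dtype_size=4))  # 2
```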
+ working_tensor = root_tensor + working_tensor_var_offset = [0] * root_shape.rank + working_tensor_shape = root_shape + working_tensor_size = total_size + while working_tensor_size > shard_size_remaining: + part_axis_and_size = self._get_next_partition( + shard_size_remaining=shard_size_remaining, + shape=working_tensor_shape, + dtype_size=dtype_size, + num_elems=working_tensor_shape.num_elements()) + + (remaining_size, + (remaining_offset, remaining_shape)) = self._add_partition( + root_shardable_tensor=shardable_tensor, + dtype_size=dtype_size, + working_tensor_offset=working_tensor_var_offset, + part_axis_and_size=part_axis_and_size, + shard_size_remaining=shard_size_remaining, + max_shard_size=self.max_shard_size, + tensors_by_shard=tensors_by_shard, + large_scalars=large_scalars) + + if remaining_size is None: + # Tensor partition couldn't fit in remaining shard space. Try again + # with the next full shard. + tensors_by_shard.append({}) + shard_size_remaining = self.max_shard_size + else: + working_tensor = array_ops.slice( + root_tensor, begin=remaining_offset, size=remaining_shape) + working_tensor_var_offset = remaining_offset + working_tensor_shape = working_tensor.shape + working_tensor_size = int(math.prod(remaining_shape)) * dtype_size + shard_size_remaining = remaining_size + + if working_tensor_shape.num_elements() > 0: + remaining_tensor_slice_spec = variables.Variable.SaveSliceInfo( + full_name=checkpoint_key, + full_shape=root_shape, + var_offset=working_tensor_var_offset, + var_shape=working_tensor_shape).spec.strip() + if not tensors_by_shard: + tensors_by_shard.append({}) + (tensors_by_shard[-1] + .setdefault(checkpoint_key, {}) + [remaining_tensor_slice_spec]) = working_tensor + shard_size_remaining -= working_tensor_size + + return tensors_by_shard + large_scalars diff --git a/tensorflow/python/checkpoint/sharding/sharding_policies_test.py b/tensorflow/python/checkpoint/sharding/sharding_policies_test.py new file mode 100644 index 00000000000000..133a0b923d6338 --- /dev/null +++ b/tensorflow/python/checkpoint/sharding/sharding_policies_test.py @@ -0,0 +1,697 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for checkpoint sharding policies.""" + +import random +import string + +from tensorflow.python.checkpoint import checkpoint +from tensorflow.python.checkpoint import checkpoint_options +from tensorflow.python.checkpoint import graph_view +from tensorflow.python.checkpoint.sharding import sharding_policies +from tensorflow.python.checkpoint.sharding import sharding_util +from tensorflow.python.eager import remote +from tensorflow.python.eager import test +from tensorflow.python.framework import device as device_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util +from tensorflow.python.module import module +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import gfile +from tensorflow.python.training import server_lib +from tensorflow.python.training.saving import saveable_object +from tensorflow.python.training.saving import saveable_object_util + + +class ShardingPoliciesTest(test.TestCase): + + def _get_shardable_tensors_by_task(self, root): + serialized_tensors, _, _, _ = ( + checkpoint.TrackableSaver(graph_view.ObjectGraphView(root)) + ._gather_serialized_tensors(None)) + + shardable_tensors_by_task = {} + for obj, tensor_dict in serialized_tensors.items(): + # Divide tensor_dict by device. + for checkpoint_key, tensor_slice_dict in tensor_dict.items(): + if not isinstance(tensor_slice_dict, dict): + # Make sure that maybe_tensor is structured as {slice_spec -> tensor}. + tensor_slice_dict = {"": tensor_slice_dict} + for slice_spec, tensor_save_spec in tensor_slice_dict.items(): + if not isinstance(tensor_save_spec, saveable_object.SaveSpec): + tensor_save_spec = saveable_object.SaveSpec( + tensor=tensor_save_spec, + slice_spec=slice_spec, + name=checkpoint_key, + dtype=tensor_save_spec.dtype, + device=tensor_save_spec.device) + save_spec_tensor = tensor_save_spec.tensor + device = (device_lib.DeviceSpec.from_string(tensor_save_spec.device) + if isinstance(tensor_save_spec.device, str) + else tensor_save_spec.device) + task = device_lib.DeviceSpec.from_string( + saveable_object_util.set_cpu0(device.to_string())) + shardable_tensors_by_task.setdefault(task, []).append( + sharding_util.ShardableTensor( + _tensor_save_spec=tensor_save_spec, + tensor=save_spec_tensor, + dtype=tensor_save_spec.dtype, + device=device, + name=tensor_save_spec.name, + shape=save_spec_tensor.shape, + slice_spec=slice_spec, + checkpoint_key=checkpoint_key, + trackable=obj)) + return shardable_tensors_by_task.values() + + def test_ShardByTaskPolicy(self): + servers = [server_lib.Server.create_local_server() for _ in range(3)] + cluster_spec = server_lib.ClusterSpec({ + "worker": [s.target[len("grpc://"):] for s in servers]}) + remote.connect_to_cluster(cluster_spec) + root = module.Module() + with ops.device("/job:worker/task:0/cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + with ops.device("/job:worker/task:1/cpu:0"): + v1 = resource_variable_ops.ResourceVariable(1.0, name="v1") + with ops.device("/job:worker/task:2/cpu:0"): + v2 = resource_variable_ops.ResourceVariable(2.0, name="v2") + root.v0 = v0 + root.v1 = v1 + root.v2 = v2 + + shardable_tensors = self._get_shardable_tensors_by_task(root) + + callback = sharding_policies.ShardByTaskPolicy() + shards = [] + for 
tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertAllEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE"}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE"}, + {"v2/.ATTRIBUTES/VARIABLE_VALUE"}, + {"_CHECKPOINTABLE_OBJECT_GRAPH"} + ]) + + self.assertEqual( + self.evaluate(shards[0]["v0/.ATTRIBUTES/VARIABLE_VALUE"][""]), + v0.numpy()) + self.assertEqual( + self.evaluate(shards[1]["v1/.ATTRIBUTES/VARIABLE_VALUE"][""]), + v1.numpy()) + self.assertEqual( + self.evaluate(shards[2]["v2/.ATTRIBUTES/VARIABLE_VALUE"][""]), + v2.numpy()) + + def test_CheckpointOption_ShardByTaskPolicy(self): + servers = [server_lib.Server.create_local_server() for _ in range(3)] + cluster_spec = server_lib.ClusterSpec({ + "worker": [s.target[len("grpc://"):] for s in servers]}) + remote.connect_to_cluster(cluster_spec) + root = module.Module() + with ops.device("/job:worker/task:0/cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + self.evaluate(v0.initializer) + with ops.device("/job:worker/task:1/cpu:0"): + v1 = resource_variable_ops.ResourceVariable(1.0, name="v1") + self.evaluate(v1.initializer) + with ops.device("/job:worker/task:2/cpu:0"): + v2 = resource_variable_ops.ResourceVariable(2.0, name="v2") + self.evaluate(v2.initializer) + root.v0 = v0 + root.v1 = v1 + root.v2 = v2 + + tmp_dir = self.create_tempdir("ckpt") + ckpt = checkpoint.Checkpoint(root) + save_path = ckpt.save( + tmp_dir, options=checkpoint_options.CheckpointOptions( + experimental_sharding_callback=( + sharding_policies.ShardByTaskPolicy()))) + self.assertLen(gfile.Glob(save_path + ".data*"), 4) + ckpt.restore(save_path) + + @test_util.run_in_graph_and_eager_modes + def test_MaxShardSizePolicy_1D(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0, 3.0], + name="v0", + dtype=dtypes.float32) + v1 = resource_variable_ops.ResourceVariable([[4], + [5], + [6], + [7]], + name="v1", + dtype=dtypes.int32) + self.evaluate(v0.initializer) + self.evaluate(v1.initializer) + root.v0 = v0 + root.v1 = v1 + + v0_name = "v0/.ATTRIBUTES/VARIABLE_VALUE" + v1_name = "v1/.ATTRIBUTES/VARIABLE_VALUE" + + class V0SaveSliceInfo(variables.Variable.SaveSliceInfo): + def __init__(self, var_offset, var_shape): + super().__init__( + full_name=v0_name, + full_shape=tensor_shape.TensorShape(dims=[4]), + var_offset=var_offset, + var_shape=var_shape) + + class V1SaveSliceInfo(variables.Variable.SaveSliceInfo): + def __init__(self, var_offset, var_shape): + super().__init__( + full_name=v1_name, + full_shape=tensor_shape.TensorShape(dims=[4, 1]), + var_offset=var_offset, + var_shape=var_shape) + + shardable_tensors = self._get_shardable_tensors_by_task(root) + + # Test sharding the v0 & v1 tensors with different max shard sizes. + + # max_shard_size: 4 bytes + # Each element of v0/v1 is a 32 bit/4 byte value, so each variable should be + # split into 4 shards. 
+ callback = sharding_policies.MaxShardSizePolicy(max_shard_size=4) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0], var_shape=[1]).spec + self.assertEqual(self.evaluate(shards[0][v0_name][slice_spec]), 0.0) + + slice_spec = V0SaveSliceInfo(var_offset=[1], var_shape=[1]).spec + self.assertEqual(self.evaluate(shards[1][v0_name][slice_spec]), 1.0) + + slice_spec = V0SaveSliceInfo(var_offset=[2], var_shape=[1]).spec + self.assertEqual(self.evaluate(shards[2][v0_name][slice_spec]), 2.0) + + slice_spec = V0SaveSliceInfo(var_offset=[3], var_shape=[1]).spec + self.assertEqual(self.evaluate(shards[3][v0_name][slice_spec]), 3.0) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0], var_shape=[1, 1]).spec + self.assertEqual(self.evaluate(shards[4][v1_name][slice_spec]), [4]) + + slice_spec = V1SaveSliceInfo(var_offset=[1, 0], var_shape=[1, 1]).spec + self.assertEqual(self.evaluate(shards[5][v1_name][slice_spec]), [5]) + + slice_spec = V1SaveSliceInfo(var_offset=[2, 0], var_shape=[1, 1]).spec + self.assertEqual(self.evaluate(shards[6][v1_name][slice_spec]), [6]) + + slice_spec = V1SaveSliceInfo(var_offset=[3, 0], var_shape=[1, 1]).spec + self.assertEqual(self.evaluate(shards[7][v1_name][slice_spec]), [7]) + + # max_shard_size: 8 bytes + # v0/v1 haven't changed, so they should now be split into 2 shards each. + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=8) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0], var_shape=[2]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [0.0, 1.0]) + + slice_spec = V0SaveSliceInfo(var_offset=[2], var_shape=[2]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [2.0, 3.0]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0], var_shape=[2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[2][v1_name][slice_spec]), [[4], [5]]) + + slice_spec = V1SaveSliceInfo(var_offset=[2, 0], var_shape=[2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[3][v1_name][slice_spec]), [[6], [7]]) + + # max_shard_size: 10 bytes + # 10 bytes is an uneven boundary for 4 byte elements. v0/v1 should be split + # into 2 shards each. 
+ callback = sharding_policies.MaxShardSizePolicy(max_shard_size=10) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0], var_shape=[2]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [0.0, 1.0]) + + slice_spec = V0SaveSliceInfo(var_offset=[2], var_shape=[2]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [2.0, 3.0]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0], var_shape=[2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[2][v1_name][slice_spec]), [[4], [5]]) + + slice_spec = V1SaveSliceInfo(var_offset=[2, 0], var_shape=[2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[3][v1_name][slice_spec]), [[6], [7]]) + + # max_shard_size: 16 bytes + # 16 bytes the exact size of each variable, so they should get 1 shard each. + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=16) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0], var_shape=[4]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [0.0, 1.0, 2.0, 3.0]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0], var_shape=[4, 1]).spec + self.assertAllEqual( + self.evaluate(shards[1][v1_name][slice_spec]), [[4], [5], [6], [7]]) + + # max_shard_size: 18 bytes + # 18 bytes slightly larger than the size of each variable, but not large + # enough to fit another 4 byte element, so they should get 1 shard each. 
+ callback = sharding_policies.MaxShardSizePolicy(max_shard_size=18) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0], var_shape=[4]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [0.0, 1.0, 2.0, 3.0]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0], var_shape=[4, 1]).spec + self.assertAllEqual( + self.evaluate(shards[1][v1_name][slice_spec]), [[4], [5], [6], [7]]) + + @test_util.run_in_graph_and_eager_modes + def test_MaxShardSizePolicy_2D(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable([[0, 1], + [2, 3], + [4, 5]], + name="v0") + v1 = resource_variable_ops.ResourceVariable([[[6.0], [7.0]], + [[8.0], [9.0]], + [[10.0], [11.0]]], name="v1") + self.evaluate(v0.initializer) + self.evaluate(v1.initializer) + root.v0 = v0 + root.v1 = v1 + + v0_name = "v0/.ATTRIBUTES/VARIABLE_VALUE" + v1_name = "v1/.ATTRIBUTES/VARIABLE_VALUE" + + class V0SaveSliceInfo(variables.Variable.SaveSliceInfo): + def __init__(self, var_offset, var_shape): + super().__init__( + full_name=v0_name, + full_shape=tensor_shape.TensorShape(dims=[3, 2]), + var_offset=var_offset, + var_shape=var_shape) + + class V1SaveSliceInfo(variables.Variable.SaveSliceInfo): + def __init__(self, var_offset, var_shape): + super().__init__( + full_name=v1_name, + full_shape=tensor_shape.TensorShape(dims=[3, 2, 1]), + var_offset=var_offset, + var_shape=var_shape) + + shardable_tensors = self._get_shardable_tensors_by_task(root) + + # Test sharding the v0 & v1 tensors with different max shard sizes. + + # max_shard_size: 8 bytes + # Each element of v0/v1 is a 32 bit/4 byte value, so each variable should be + # split into 3 shards. + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=8) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [[0, 1]]) + + slice_spec = V0SaveSliceInfo(var_offset=[1, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [[2, 3]]) + + slice_spec = V0SaveSliceInfo(var_offset=[2, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[2][v0_name][slice_spec]), [[4, 5]]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[3][v1_name][slice_spec]), [[[6.0], [7.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[1, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[4][v1_name][slice_spec]), [[[8.0], [9.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[2, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[5][v1_name][slice_spec]), [[[10.0], [11.0]]]) + + # max_shard_size: 10 bytes + # 10 bytes is an uneven boundary for 4 byte elements. 
v0/v1 should be split + # into 3 shards each. + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=10) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [[0, 1]]) + + slice_spec = V0SaveSliceInfo(var_offset=[1, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [[2, 3]]) + + slice_spec = V0SaveSliceInfo(var_offset=[2, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[2][v0_name][slice_spec]), [[4, 5]]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[3][v1_name][slice_spec]), [[[6.0], [7.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[1, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[4][v1_name][slice_spec]), [[[8.0], [9.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[2, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[5][v1_name][slice_spec]), [[[10.0], [11.0]]]) + + # max_shard_size: 12 bytes + # 12 bytes is enough to fit 3 elements per variable in each shard, BUT that + # would require concurrent multidimensional tensor partitioning, which is + # not currently implemented for MaxShardSizePolicy. (When partitioning a + # tensor into a shard, we choose an axis to partition along. This can + # happen multiple times for a given tensor (in the case that the tensor + # spans multiple shards). In that case, multiple dimensions can be + # partitioned along (each time the tensor is partitioned, a new axis can be + # chosen), but not within a single iteration of adding a tensor partition to + # the shard.) So, v0/v1 should be split into 3 shards each. 
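The arithmetic behind the comment above, spelled out for v0 using the formulas in `_get_next_partition` (v1 works out identically):

```
# v0: shape [3, 2], int32 -> num_elems = 6, dtype_size = 4, fresh shard = 12 B.
# Axis 0: bytes_per_slice = 6 // 3 * 4 = 8; floor(12 / 8) = 1 row per shard,
#         so ceil(3 / 1) = 3 partitions of one row each.
# Axis 1: bytes_per_slice = 6 // 2 * 4 = 12; only ceil(2 / 1) = 2 partitions,
#         but a 12-byte partition is not strictly smaller than the 12 bytes
#         remaining, so axis 1 loses to axis 0.
# Result: one 8-byte row lands in each shard (4 bytes left unused per shard),
# giving the 3 shards per variable asserted below.
```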
+ callback = sharding_policies.MaxShardSizePolicy(max_shard_size=12) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [[0, 1]]) + + slice_spec = V0SaveSliceInfo(var_offset=[1, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [[2, 3]]) + + slice_spec = V0SaveSliceInfo(var_offset=[2, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[2][v0_name][slice_spec]), [[4, 5]]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[3][v1_name][slice_spec]), [[[6.0], [7.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[1, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[4][v1_name][slice_spec]), [[[8.0], [9.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[2, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[5][v1_name][slice_spec]), [[[10.0], [11.0]]]) + + # max_shard_size: 16 bytes + # Each variable should be split into 1.5 shards. The middle shard will + # contain elements from both variables. + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=16) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE", "v1/.ATTRIBUTES/VARIABLE_VALUE"}, + {"v1/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + # V0 + slice_spec = V0SaveSliceInfo(var_offset=[0, 0], var_shape=[2, 2]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [[0, 1], [2, 3]]) + + slice_spec = V0SaveSliceInfo(var_offset=[2, 0], var_shape=[1, 2]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [[4, 5]]) + + # V1 + slice_spec = V1SaveSliceInfo(var_offset=[0, 0, 0], var_shape=[1, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[1][v1_name][slice_spec]), [[[6.0], [7.0]]]) + + slice_spec = V1SaveSliceInfo(var_offset=[1, 0, 0], var_shape=[2, 2, 1]).spec + self.assertAllEqual( + self.evaluate(shards[2][v1_name][slice_spec]), + [[[8.0], [9.0]], [[10.0], [11.0]]]) + + @test_util.run_in_graph_and_eager_modes + def test_MaxShardSizePolicy_Strings(self): + v_strings = [ + "".join(random.choices(string.ascii_uppercase + string.digits, k=10)) + for _ in range(4)] + + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable(v_strings, name="v0", + dtype=dtypes.string) + self.evaluate(v0.initializer) + root.v0 = v0 + + v0_name = "v0/.ATTRIBUTES/VARIABLE_VALUE" + + class V0SaveSliceInfo(variables.Variable.SaveSliceInfo): + def __init__(self, var_offset, var_shape): + super().__init__( + full_name=v0_name, + full_shape=tensor_shape.TensorShape(dims=[4]), + var_offset=var_offset, + var_shape=var_shape) + + shardable_tensors = self._get_shardable_tensors_by_task(root) + + # Test sharding the v0 & v1 tensors with different max shard sizes. 
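The byte counts this strings test relies on come from the `string_ops.string_length(..., unit="BYTE")` measurement in `MaxShardSizePolicy.__call__`; a minimal eager-mode sketch with an illustrative 10-character value:

```
import tensorflow as tf

v = tf.constant(["A1B2C3D4E5"])  # one 10-character ASCII string
sizes = [int(tf.strings.length(elem, unit="BYTE")) for elem in v]
print(sizes)  # [10] -- with max_shard_size=10, exactly one string fits per shard
```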
+ + # max_shard_size: 10 bytes + # Each string in v0 is 10 bytes, so there should be 1 string per shard. + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=10) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE", "_CHECKPOINTABLE_OBJECT_GRAPH",} + ]) + + slice_spec = V0SaveSliceInfo(var_offset=[0], var_shape=[1]).spec + self.assertAllEqual( + self.evaluate(shards[0][v0_name][slice_spec]), [v_strings[0]]) + + slice_spec = V0SaveSliceInfo(var_offset=[1], var_shape=[1]).spec + self.assertAllEqual( + self.evaluate(shards[1][v0_name][slice_spec]), [v_strings[1]]) + + slice_spec = V0SaveSliceInfo(var_offset=[2], var_shape=[1]).spec + self.assertAllEqual( + self.evaluate(shards[2][v0_name][slice_spec]), [v_strings[2]]) + + slice_spec = V0SaveSliceInfo(var_offset=[3], var_shape=[1]).spec + self.assertAllEqual( + self.evaluate(shards[3][v0_name][slice_spec]), [v_strings[3]]) + + @test_util.run_in_graph_and_eager_modes + def test_MaxShardSizePolicy_LargeScalar(self): + v_string = "".join(random.choices( + string.ascii_uppercase + string.digits, k=10)).encode("utf-8") + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable( + v_string, name="v0", dtype=dtypes.string) + self.evaluate(v0.initializer) + root.v0 = v0 + + v0_name = "v0/.ATTRIBUTES/VARIABLE_VALUE" + + shardable_tensors = self._get_shardable_tensors_by_task(root) + + # max_shard_size: 8 bytes + callback = sharding_policies.MaxShardSizePolicy(max_shard_size=8) + shards = [] + for tensors in shardable_tensors: + shards.extend(callback(tensors)) + + self.assertEqual( + [set(shard.keys()) for shard in shards], + [ + {"_CHECKPOINTABLE_OBJECT_GRAPH",}, + {"v0/.ATTRIBUTES/VARIABLE_VALUE",} + ]) + + tensor_val = (self.evaluate(shards[1][v0_name][""]) + if ops.context.executing_eagerly() + else shards[1][v0_name][""]) + self.assertEqual(tensor_val, v_string) + + @test_util.run_in_graph_and_eager_modes + def test_CheckpointOption_MaxShardSizePolicy(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable([[0, 1], + [2, 3], + [4, 5]], + name="v0") + v1 = resource_variable_ops.ResourceVariable([[[6.0], [7.0]], + [[8.0], [9.0]], + [[10.0], [11.0]]], name="v1") + v2 = resource_variable_ops.ResourceVariable("test_string", name="v1") + self.evaluate(v0.initializer) + self.evaluate(v1.initializer) + self.evaluate(v2.initializer) + root.v0 = v0 + root.v1 = v1 + root.v2 = v2 + + tmp_dir = self.create_tempdir("ckpt") + ckpt = checkpoint.Checkpoint(root) + save_path = ckpt.save( + tmp_dir, options=checkpoint_options.CheckpointOptions( + experimental_sharding_callback=( + sharding_policies.MaxShardSizePolicy(max_shard_size=10)))) + self.assertLen(gfile.Glob(save_path + ".data*"), 8) + ckpt.restore(save_path) + + +if __name__ == "__main__": + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/python/checkpoint/sharding/sharding_util.py b/tensorflow/python/checkpoint/sharding/sharding_util.py new file mode 100644 index 00000000000000..322bba18dcfa84 --- /dev/null +++ b/tensorflow/python/checkpoint/sharding/sharding_util.py @@ -0,0 +1,263 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Data structures and utilities for checkpoint sharding.""" + +import abc +import dataclasses +import inspect +from typing import Hashable, MutableMapping, Sequence + +from tensorflow.python.framework import device as device_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor as tensor_lib +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec +from tensorflow.python.ops import variables +from tensorflow.python.trackable import base +from tensorflow.python.training.saving import saveable_object +from tensorflow.python.util import tf_export + + +TensorSlice = MutableMapping[tensor_spec.TensorSpec, tensor_lib.Tensor] +TensorSliceDict = MutableMapping[str, TensorSlice] + + +@tf_export.tf_export("train.experimental.ShardableTensor") +@dataclasses.dataclass(frozen=True) +class ShardableTensor: + """Tensor wrapper containing data necessary for sharding. + + The tensor representation used as inputs to pre-made and custom + `tf.train.experiemental.ShardingCallback`s, which can be specified using the + `experimental_sharding_callback` option in `tf.train.CheckpointOptions`. + + """ + _tensor_save_spec: saveable_object.SaveSpec + tensor: tensor_lib.Tensor + dtype: dtypes.DType + device: device_lib.DeviceSpec + name: str + shape: tensor_shape.TensorShape + slice_spec: variables.Variable.SaveSliceInfo + checkpoint_key: str + trackable: base.Trackable + + def __hash__(self) -> int: + return hash((self.name, self.dtype, str(self.device), self.checkpoint_key)) + + def __repr__(self) -> str: + return (f"\n{self.__class__.__name__}:\n" + f" _tensor_save_spec={self._tensor_save_spec!r}\n" + f" tensor={self.tensor!r}\n" + f" dtype={self.dtype!r}\n" + f" device={self.device!r}\n" + f" name={self.name!r}\n" + f" shape={self.shape!r}\n" + f" slice_spec={self.slice_spec!r}\n" + f" checkpoint_key={self.checkpoint_key!r}\n" + f" trackable={self.trackable!r}") + + +@tf_export.tf_export("train.experimental.ShardingCallback") +class ShardingCallback(abc.ABC): + """Checkpoint sharding callback function, along with a text description. + + A callback function wrapper that will be executed to determine how tensors + will be split into shards when the saver writes the checkpoint shards to disk. + + The callback takes a list of `tf.train.experimental.ShardableTensor`s as input + (as well as any kwargs defined by the `tf.train.experimental.ShardingCallback` + subclass), and organizes the input tensors into different shards. Tensors are + first organized by device task (see `tf.DeviceSpec`), then the callback will + be called for each collection of tensors. + + There are a few restrictions to keep in mind when creating a custom callback: + - Tensors must not be removed from the checkpoint. + - Tensors must not be reshaped. + - Tensor dtypes must not change. 
+ - Tensors within a shard must belong to the same task. + Validation checks will be performed after the callback function is executed to + ensure these restrictions aren't violated. + + Here's an example of a simple custom callback: + + ``` + # Place all tensors in a single shard. + class AllInOnePolicy(tf.train.experimental.ShardingCallback): + @property + def description(self): + return "Place all tensors in a single shard." + + def __call__(self, shardable_tensors): + tensors = {} + for shardable_tensor in shardable_tensors: + tensor = shardable_tensor.tensor_save_spec.tensor + checkpoint_key = shardable_tensor.checkpoint_key + slice_spec = shardable_tensor.slice_spec + + tensors.set_default(checkpoint_key, {})[slice_spec] = tensor + return [tensors] + + ckpt.save( + "path", + options=tf.train.CheckpointOptions( + experimental_sharding_callback=AllInOnePolicy())) + ``` + + The `description` attribute is used to identify the callback and to aid + debugging during saving and restoration. + + To take in kwargs, simply define the constructor and pass them in: + + ``` + class ParameterPolicy(tf.train.experimental.ShardingCallback): + def __init__(self, custom_param): + self.custom_param = custom_param + ... + + ckpt.save( + "path", + options=tf.train.CheckpointOptions( + experimental_sharding_callback=ParameterPolicy(custom_param=...))) + ``` + + """ + description: str + + @property + @abc.abstractmethod + def description(self) -> str: + pass + + @abc.abstractmethod + def __call__( + self, shardable_tensors: Sequence[ShardableTensor] + ) -> Sequence[TensorSliceDict]: + pass + + def __hash__(self) -> int: + hash_val = hash(self.description) + # vars() only includes user-defined attributes. + for attr_name, attr_val in vars(self).items(): + if not (inspect.ismethod(attr_val) or inspect.isfunction(attr_val)): + hash_val ^= hash(attr_name) + if isinstance(attr_val, Hashable): + hash_val ^= hash(attr_val) + return hash_val + + +def validate_shards( + shards: Sequence[TensorSliceDict], + shardable_tensors: Sequence[ShardableTensor], + callback_description: str +) -> None: + """Validates shards generated by the sharding_callback.""" + unseen_tensor_dict = {} + for shardable_tensor in shardable_tensors: + unseen_tensor_dict.setdefault( + shardable_tensor.checkpoint_key, {} + )[shardable_tensor.slice_spec] = shardable_tensor.tensor + seen_tensor_set = set() + + for shard_tensors in shards: + task_tensor = None + for checkpoint_key, tensor_slice_dict in shard_tensors.items(): + for slice_spec, shard_tensor in tensor_slice_dict.items(): + slice_spec = slice_spec.strip() + + # Validate uniqueness. + if (checkpoint_key, slice_spec) in seen_tensor_set: + raise RuntimeError( + "After executing the checkpoint sharding callback, multiple " + "tensors with the same checkpoint key and slice spec were " + "found:\n" + f" callback_description: {callback_description}\n" + f" checkpoint_key: {checkpoint_key}\n" + f" slice_spec: {slice_spec}\n") + + # Validate no added tensors. + if checkpoint_key not in unseen_tensor_dict: + raise RuntimeError( + "After executing the checkpoint sharding callback, a tensor " + "not originally in the object graph was found in the " + "checkpoint shards:\n" + f" callback_description: {callback_description}\n" + f" checkpoint_key: {checkpoint_key}\n" + f" slice_spec: {slice_spec}\n") + + # Validate no shape change. 
+ target_shape = unseen_tensor_dict[checkpoint_key][slice_spec].shape + if shard_tensor.shape != target_shape: + raise RuntimeError( + "After executing the checkpoint sharding callback, a tensor " + "was found with an altered shape:\n" + f" callback_description: {callback_description}\n" + f" checkpoint_key: {checkpoint_key}\n" + f" slice_spec: {slice_spec}\n" + f" original tensor_shape: {target_shape}\n" + f" new tensor_shape: {shard_tensor.shape}\n") + + # Validate no dtype change. + target_dtype = unseen_tensor_dict[checkpoint_key][slice_spec].dtype + if shard_tensor.dtype != target_dtype: + raise RuntimeError( + "After executing the checkpoint sharding callback, a tensor " + "was found with an altered dtype:\n" + f" callback_description: {callback_description}\n" + f" checkpoint_key: {checkpoint_key}\n" + f" slice_spec: {slice_spec}\n" + f" original tensor_dtype: {target_dtype}\n" + f" new tensor_dtype: {shard_tensor.dtype}\n") + + # Validate same task in shard. + if task_tensor is None: + task_tensor = ShardableTensor + task_tensor.device = shard_tensor.device + task_tensor.checkpoint_key = checkpoint_key + task_tensor.slice_spec = slice_spec + else: + task1 = device_lib.DeviceSpec.from_string(task_tensor.device).task + task2 = device_lib.DeviceSpec.from_string(shard_tensor.device).task + if task1 is not None and task2 is not None and task1 != task2: + raise RuntimeError( + "After executing the checkpoint sharding callback, tensors " + "with different tasks were found in the same shard:\n" + f" callback_description: {callback_description}\n" + " tensor #1:" + f" checkpoint_key: {task_tensor.checkpoint_key}\n" + f" slice_spec: {task_tensor.slice_spec}\n" + f" task: {task1}\n" + " tensor #2:" + f" checkpoint_key: {checkpoint_key}\n" + f" slice_spec: {slice_spec}\n" + f" task: {task2}\n") + + del unseen_tensor_dict[checkpoint_key][slice_spec] + if not unseen_tensor_dict[checkpoint_key]: + del unseen_tensor_dict[checkpoint_key] + seen_tensor_set.add((checkpoint_key, slice_spec)) + + # validate no tensor removal + if unseen_tensor_dict: + tensors_info = "" + for ckpt_key, slice_spec in unseen_tensor_dict.items(): + tensors_info += " tensor:\n" + tensors_info += f" checkpoint_key: {ckpt_key}\n" + tensors_info += f" slice_spec: {slice_spec}\n" + raise RuntimeError( + "After executing the checkpoint sharding callback, tensors in the " + "object graph were not found in the checkpoint shards:\n" + f" callback_description: {callback_description}\n" + f"{tensors_info}") diff --git a/tensorflow/python/checkpoint/sharding/sharding_util_test.py b/tensorflow/python/checkpoint/sharding/sharding_util_test.py new file mode 100644 index 00000000000000..1c5acbea791b78 --- /dev/null +++ b/tensorflow/python/checkpoint/sharding/sharding_util_test.py @@ -0,0 +1,382 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= +"""Tests for checkpoint sharding structures and utilities.""" + + +from typing import Sequence + +from tensorflow.python.checkpoint import checkpoint +from tensorflow.python.checkpoint import graph_view +from tensorflow.python.checkpoint.sharding import sharding_policies +from tensorflow.python.checkpoint.sharding import sharding_util +from tensorflow.python.eager import remote +from tensorflow.python.eager import test +from tensorflow.python.framework import device as device_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor as tensor_lib +from tensorflow.python.module import module +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.training import server_lib +from tensorflow.python.training.saving import saveable_object +from tensorflow.python.training.saving import saveable_object_util + + +class ShardingUtilTest(test.TestCase): + + def _get_shardable_tensors_by_task(self, root): + serialized_tensors, _, _, _ = ( + checkpoint.TrackableSaver(graph_view.ObjectGraphView(root)) + ._gather_serialized_tensors(None)) + + shardable_tensors_by_task = {} + for obj, tensor_dict in serialized_tensors.items(): + for checkpoint_key, tensor_slice_dict in tensor_dict.items(): + if not isinstance(tensor_slice_dict, dict): + # Make sure that maybe_tensor is structured as {slice_spec -> tensor}. + tensor_slice_dict = {"": tensor_slice_dict} + for slice_spec, tensor_save_spec in tensor_slice_dict.items(): + if not isinstance(tensor_save_spec, saveable_object.SaveSpec): + tensor_save_spec = saveable_object.SaveSpec( + tensor=tensor_save_spec, + slice_spec=slice_spec, + name=checkpoint_key, + dtype=tensor_save_spec.dtype, + device=tensor_save_spec.device) + save_spec_tensor = tensor_save_spec.tensor + device = (device_lib.DeviceSpec.from_string(tensor_save_spec.device) + if isinstance(tensor_save_spec.device, str) + else tensor_save_spec.device) + task = device_lib.DeviceSpec.from_string( + saveable_object_util.set_cpu0(device.to_string())) + shardable_tensors_by_task.setdefault(task, []).append( + sharding_util.ShardableTensor( + _tensor_save_spec=tensor_save_spec, + tensor=save_spec_tensor, + dtype=tensor_save_spec.dtype, + device=device, + name=tensor_save_spec.name, + shape=save_spec_tensor.shape, + slice_spec=slice_spec.strip(), + checkpoint_key=checkpoint_key, + trackable=obj)) + return shardable_tensors_by_task.values() + + def test_hash_ShardingCallback(self): + class BlankCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + pass + + self.assertEqual(hash(BlankCallback()), hash(BlankCallback())) + + class ValueCallback(sharding_util.ShardingCallback): + def __init__(self, val): + self.val = val + + @property + def description(self): + return "value callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + pass + + self.assertEqual(hash(ValueCallback(1)), hash(ValueCallback(1))) + self.assertNotEqual(hash(ValueCallback(1)), hash(ValueCallback(2))) + + def test_validate_shards_correct(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = 
resource_variable_ops.ResourceVariable(0.0, name="v0") + with ops.device("cpu:1"): + v1 = resource_variable_ops.ResourceVariable(1.0, name="v1") + with ops.device("cpu:2"): + v2 = resource_variable_ops.ResourceVariable(2.0, name="v2") + root.v0 = v0 + root.v1 = v1 + root.v2 = v2 + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = sharding_policies.ShardByTaskPolicy() + shards = [] + for tensors in shardable_tensors: + shards.extend(sharding_callback(tensors)) + + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + self.assertEqual( + [list(shard.keys()) for shard in shards], + [[ + "v0/.ATTRIBUTES/VARIABLE_VALUE", + "v1/.ATTRIBUTES/VARIABLE_VALUE", + "v2/.ATTRIBUTES/VARIABLE_VALUE", + "_CHECKPOINTABLE_OBJECT_GRAPH" + ]]) + + self.assertEqual( + shards[0]["v0/.ATTRIBUTES/VARIABLE_VALUE"][""].numpy(), + v0.numpy()) + self.assertEqual( + shards[0]["v1/.ATTRIBUTES/VARIABLE_VALUE"][""].numpy(), + v1.numpy()) + self.assertEqual( + shards[0]["v2/.ATTRIBUTES/VARIABLE_VALUE"][""].numpy(), + v2.numpy()) + + def test_validate_shards_duplicate_tensor(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + with ops.device("cpu:1"): + v1 = resource_variable_ops.ResourceVariable(1.0, name="v1") + root.v0 = v0 + root.v1 = v1 + + class DuplicateTensorCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "duplicate tensor callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + tensor = shardable_tensors[0].tensor + checkpoint_key = shardable_tensors[0].checkpoint_key + slice_spec = shardable_tensors[0].slice_spec + shards = [ + {checkpoint_key: {slice_spec: tensor}}, + {checkpoint_key: {slice_spec: tensor}} + ] + return shards + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = DuplicateTensorCallback() + shards = [] + for tensors in shardable_tensors: + shards.extend(sharding_callback(tensors)) + + with self.assertRaisesRegex(RuntimeError, + "multiple tensors with the same checkpoint " + "key and slice spec were found"): + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + def test_validate_shards_added_tensor(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + root.v0 = v0 + + class AddedTensorCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "added tensor callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + checkpoint_key = "ADDED_TENSOR_ABC123" + slice_spec = "" + tensor = tensor_lib.Tensor() + return [{checkpoint_key: {slice_spec: tensor}}] + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = AddedTensorCallback() + shards = [] + for tensors in shardable_tensors: + shards.extend(sharding_callback(tensors)) + + with self.assertRaisesRegex(RuntimeError, + "a tensor not originally in the object 
graph"): + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + def test_validate_shards_shape_change(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable([[0.0, 1.0]], name="v0") + root.v0 = v0 + + class ShapeChangeCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "shape change callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + shards = [] + for shardable_tensor in shardable_tensors: + tensor = shardable_tensor.tensor + checkpoint_key = shardable_tensor.checkpoint_key + slice_spec = shardable_tensor.slice_spec + if checkpoint_key == "v0/.ATTRIBUTES/VARIABLE_VALUE": + tensor = array_ops.transpose(tensor) + shards.append({checkpoint_key: {slice_spec: tensor}}) + return shards + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = ShapeChangeCallback() + shards = [] + for tensors in shardable_tensors: + shards.extend(sharding_callback(tensors)) + + with self.assertRaisesRegex(RuntimeError, + "a tensor was found with an altered shape"): + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + def test_validate_shards_dtype_change(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + root.v0 = v0 + + class DtypeChangeCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "dtype change callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + shards = [] + for shardable_tensor in shardable_tensors: + tensor = shardable_tensor.tensor + checkpoint_key = shardable_tensor.checkpoint_key + slice_spec = shardable_tensor.slice_spec + if checkpoint_key == "v0/.ATTRIBUTES/VARIABLE_VALUE": + tensor = math_ops.cast(tensor, dtype=dtypes.int32) + shards.append({checkpoint_key: {slice_spec: tensor}}) + return shards + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = DtypeChangeCallback() + shards = [] + for tensors in shardable_tensors: + shards.extend(sharding_callback(tensors)) + + with self.assertRaisesRegex(RuntimeError, + "a tensor was found with an altered dtype"): + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + def test_validate_shards_different_tasks(self): + servers = [server_lib.Server.create_local_server() for _ in range(3)] + cluster_spec = server_lib.ClusterSpec({ + "worker": [s.target[len("grpc://"):] for s in servers]}) + remote.connect_to_cluster(cluster_spec) + + root = module.Module() + with ops.device("/job:worker/task:0/cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + with ops.device("/job:worker/task:1/cpu:0"): + v1 = resource_variable_ops.ResourceVariable(0.0, name="v1") + root.v0 = v0 + root.v1 = v1 + + class DifferentTasksCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "different tasks callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + 
shard = {} + for shardable_tensor in shardable_tensors: + tensor = shardable_tensor.tensor + checkpoint_key = shardable_tensor.checkpoint_key + slice_spec = shardable_tensor.slice_spec + shard.setdefault(checkpoint_key, {})[slice_spec] = tensor + return [shard] + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = DifferentTasksCallback() + shards = sharding_callback(shardable_tensors_flat) + + with self.assertRaisesRegex(RuntimeError, + "tensors with different tasks were found"): + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + def test_validate_shards_tensor_removal(self): + root = module.Module() + with ops.device("cpu:0"): + v0 = resource_variable_ops.ResourceVariable(0.0, name="v0") + root.v0 = v0 + + class TensorRemovalCallback(sharding_util.ShardingCallback): + @property + def description(self): + return "tensor removal callback" + + def __call__( + self, shardable_tensors: Sequence[sharding_util.ShardableTensor] + ) -> Sequence[sharding_util.TensorSliceDict]: + return [] + + shardable_tensors = self._get_shardable_tensors_by_task(root) + shardable_tensors_flat = [] + for tensors in shardable_tensors: + shardable_tensors_flat.extend(tensors) + + sharding_callback = TensorRemovalCallback() + shards = [] + for tensors in shardable_tensors: + shards.extend(sharding_callback(tensors)) + + with self.assertRaisesRegex(RuntimeError, + "tensors in the object graph were not found"): + sharding_util.validate_shards( + shards, shardable_tensors_flat, sharding_callback.description) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/client/BUILD b/tensorflow/python/client/BUILD index 782ffe58b43059..1f8f7e6b8b1d31 100644 --- a/tensorflow/python/client/BUILD +++ b/tensorflow/python/client/BUILD @@ -3,6 +3,10 @@ load("//tensorflow:strict.default.bzl", "py_strict_library") load("//tensorflow:tensorflow.bzl", "tf_cuda_library") load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test", "tf_python_pybind_extension") load("//tensorflow/core/platform:build_config_root.bzl", "if_static") +load( + "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -108,6 +112,10 @@ tf_python_pybind_extension( tf_python_pybind_extension( name = "_pywrap_events_writer", srcs = ["events_writer_wrapper.cc"], + enable_stub_generation = True, + pytype_srcs = [ + "_pywrap_events_writer.pyi", + ], deps = [ "//tensorflow/core:framework_headers_lib", "//tensorflow/core:lib_headers_for_pybind", @@ -433,6 +441,7 @@ tf_py_strict_test( python_version = "PY3", tags = [ "no_gpu", + "no_rocm", "no_windows", ], deps = [ @@ -494,7 +503,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "session_benchmark", srcs = ["session_benchmark.py"], grpc_enabled = True, diff --git a/tensorflow/python/tpu/tpu_config.py b/tensorflow/python/client/_pywrap_events_writer.pyi similarity index 52% rename from tensorflow/python/tpu/tpu_config.py rename to tensorflow/python/client/_pywrap_events_writer.pyi index eda3717520f7a8..92da35bcfe093b 100644 --- a/tensorflow/python/tpu/tpu_config.py +++ b/tensorflow/python/client/_pywrap_events_writer.pyi @@ -1,10 +1,10 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2023 The TensorFlow Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,8 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Stub file to maintain backwards compatibility.""" -# pylint: disable=wildcard-import,unused-import -from tensorflow_estimator.python.estimator.tpu.tpu_config import * -# pylint: enable=wildcard-import,unused-import +class EventsWriter: + def __init__(self, arg0: str) -> None: ... + def Close(self) -> Status: ... + def FileName(self) -> str: ... + def Flush(self) -> Status: ... + def InitWithSuffix(self, arg0: str) -> Status: ... + def WriteEvent(self, arg0: object) -> None: ... + def _WriteSerializedEvent(self, arg0: str) -> None: ... + +class Status: + def __init__(self, *args, **kwargs) -> None: ... diff --git a/tensorflow/python/client/events_writer_wrapper.cc b/tensorflow/python/client/events_writer_wrapper.cc index 7e5720c4eef02d..661c845b3aac57 100644 --- a/tensorflow/python/client/events_writer_wrapper.cc +++ b/tensorflow/python/client/events_writer_wrapper.cc @@ -14,8 +14,10 @@ limitations under the License. ==============================================================================*/ #include "absl/strings/string_view.h" +#include "pybind11/attr.h" // from @pybind11 #include "pybind11/pybind11.h" // from @pybind11 #include "pybind11/pytypes.h" // from @pybind11 +#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/events_writer.h" #include "tensorflow/python/lib/core/pybind11_absl.h" #include "tensorflow/python/lib/core/pybind11_proto.h" @@ -24,6 +26,7 @@ limitations under the License. namespace py = pybind11; PYBIND11_MODULE(_pywrap_events_writer, m) { + py::class_ Status(m, "Status", py::module_local()); py::class_ events_writer_class(m, "EventsWriter"); events_writer_class.def(py::init()) .def("InitWithSuffix", diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py index 075d69e78bc400..79cedb5a2ffdd6 100644 --- a/tensorflow/python/client/session_partial_run_test.py +++ b/tensorflow/python/client/session_partial_run_test.py @@ -26,7 +26,6 @@ from tensorflow.python.platform import googletest from tensorflow.python.training import server_lib - class PartialRunTest(test_util.TensorFlowTestCase): def RunTestPartialRun(self, sess): diff --git a/tensorflow/python/client/tf_session_wrapper.cc b/tensorflow/python/client/tf_session_wrapper.cc index 790629c96d2e4f..160416c4199102 100644 --- a/tensorflow/python/client/tf_session_wrapper.cc +++ b/tensorflow/python/client/tf_session_wrapper.cc @@ -138,9 +138,9 @@ pybind11::object method(pybind11::object type, Func&& function, // generation. The type is assumed to be a GC type (containing other types). 
// To add the required Python type fields, classes definitions must start with // -// TFObject_Head(classname) +// TFObject_Head(classname, TfObjectDataType) // -// Required attributes/methods: +// Required attributes/methods for TfObjectDataType type: // // Constructor(PyObject* args, PyObject* kw) // ~Destructor @@ -148,8 +148,10 @@ pybind11::object method(pybind11::object type, Func&& function, // Visit(visitproc visit, void* arg) // // Individual methods/attributes are added to the type later, as seen below. -template +template void MakeTfObjectType(PyObject** py_type) { + using TfObjectDataType = typename T::TfObjectDataType; + py::str name = py::str(T::kTypeName); py::str qualname = py::str(T::kTypeName); PyHeapTypeObject* heap_type = reinterpret_cast( @@ -162,11 +164,14 @@ void MakeTfObjectType(PyObject** py_type) { type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_BASETYPE; type->tp_name = T::kTypeName; - type->tp_basicsize = sizeof(T); + + // Allocation size for both Python object header and the TF data members. + type->tp_basicsize = sizeof(T) + sizeof(TfObjectDataType); type->tp_new = [](PyTypeObject* subtype, PyObject* args, PyObject* kwds) -> PyObject* { T* self = reinterpret_cast(subtype->tp_alloc(subtype, 0)); + TfObjectDataType* data = reinterpret_cast(&self[1]); if (!self) return nullptr; // PyType_GenericAlloc (the default implementation of tp_alloc) by default @@ -176,7 +181,7 @@ void MakeTfObjectType(PyObject** py_type) { // // We disable the GC here until initialization is finished. PyObject_GC_UnTrack(self); - new (self) T(args, kwds); + new (data) TfObjectDataType(args, kwds); self->dict = PyDict_New(); PyObject_GC_Track(self); @@ -193,9 +198,9 @@ void MakeTfObjectType(PyObject** py_type) { PyObject_ClearWeakRefs(self); T* o = reinterpret_cast(self); + TfObjectDataType* data = reinterpret_cast(&o[1]); Py_CLEAR(o->dict); - o->~T(); - + data->~TfObjectDataType(); tp->tp_free(self); Py_DECREF(tp); }; @@ -203,16 +208,18 @@ void MakeTfObjectType(PyObject** py_type) { type->tp_traverse = [](PyObject* self, visitproc visit, void* arg) { VLOG(3) << "Visit: " << T::kTypeName; T* o = reinterpret_cast(self); + TfObjectDataType* data = reinterpret_cast(&o[1]); Py_VISIT(Py_TYPE(self)); Py_VISIT(o->dict); - return o->Visit(visit, arg); + return data->Visit(visit, arg); }; type->tp_clear = [](PyObject* self) { VLOG(3) << "Clear: " << T::kTypeName; T* o = reinterpret_cast(self); + TfObjectDataType* data = reinterpret_cast(&o[1]); Py_CLEAR(o->dict); - o->Clear(); + data->Clear(); return 0; }; @@ -238,11 +245,13 @@ void MakeTfObjectType(PyObject** py_type) { *py_type = reinterpret_cast(type); } -#define TFObject_HEAD(typename) \ - PyObject_HEAD; \ - PyObject* dict = nullptr; \ - PyObject* weakrefs = nullptr; \ - static PyObject* py_type; \ +#define TFObject_HEAD(typename, datatypename) \ + using TfObjectDataType = datatypename; \ + PyObject_HEAD; \ + PyObject* dict = nullptr; \ + PyObject* weakrefs = nullptr; \ + TfObjectDataType data[0]; \ + static PyObject* py_type; \ static constexpr const char* kTypeName = #typename; struct PyGraph; @@ -272,7 +281,7 @@ PYBIND11_MAKE_OPAQUE(OpsByIdMap); PYBIND11_MAKE_OPAQUE(OpsByNameMap); // Convert the given handle to a TF object type. 
-template +template T* AsPyTfObject(py::handle handle) { if (handle.get_type() == T::py_type) { return reinterpret_cast(handle.ptr()); @@ -296,11 +305,15 @@ T* AsPyTfObject(py::handle handle) { py::cast(py::str(handle)))); } -template +template py::object AsPyObject(T* obj) { return py::reinterpret_borrow(reinterpret_cast(obj)); } +template +typename T::TfObjectDataType* AsPyTfObjectData(py::handle handle) { + return AsPyTfObject(handle)->data; +} // Reference counting helper for PyTfObjects. // // Similar to the pybind holder types, this manages the Python reference @@ -309,7 +322,7 @@ py::object AsPyObject(T* obj) { // As a special case to support Dismantle(), this allows setting our underlying // pointer to None when clearing the type. Direct access to attributes is not // allowed after this point. -template +template class tf_handle { public: tf_handle() : obj_(nullptr) {} @@ -402,9 +415,7 @@ struct TF_OperationDeleter { void operator()(TF_Operation* op) {} }; -struct PyGraph { - TFObject_HEAD(PyGraph); - +struct PyGraphData { TF_Graph* graph; // The C++ graph maintains an ID for every node, however our Python code has @@ -424,7 +435,7 @@ struct PyGraph { OpsByIdMap ops_by_id; OpsByNameMap ops_by_name; - PyGraph(PyObject* args, PyObject* kwds) { + PyGraphData(PyObject* args, PyObject* kwds) { graph = TF_NewGraph(); // By default shape inference functions are required, however this breaks @@ -433,7 +444,7 @@ struct PyGraph { graph->refiner.set_require_shape_inference_fns(false); } - ~PyGraph() { + ~PyGraphData() { Clear(); TF_DeleteGraph(graph); } @@ -462,22 +473,26 @@ struct PyGraph { } return 0; } +}; + +struct PyGraph { + TFObject_HEAD(PyGraph, PyGraphData); int64_t add_op(py::object obj); - py::list operations() { return op_list; } - int64_t num_operations() const { return op_list.size(); } + py::list operations() { return data->op_list; } + int64_t num_operations() const { return data->op_list.size(); } // Return operations that are part of the Graph, but do not yet have // OperationHandle's. This logic is only invoked when importing an existing // GraphDef into Python. It should be removed once all logic moves to C++. 
std::vector new_operations() { - tsl::mutex_lock l(graph->mu); + tsl::mutex_lock l(tf_graph()->mu); std::vector ops; // SUBTLE: `op_nodes` skips the SOURCE and SINK nodes - for (auto n : graph->graph.op_nodes()) { - if (ops_by_name.find(n->name()) == ops_by_name.end()) { + for (auto n : tf_graph()->graph.op_nodes()) { + if (data->ops_by_name.find(n->name()) == data->ops_by_name.end()) { ops.push_back(reinterpret_cast(n)); } } @@ -485,15 +500,15 @@ struct PyGraph { } py::object get_operation_by_name(const std::string& name) { - tsl::mutex_lock l(graph->mu); - auto it = ops_by_name.find(name); - if (it == ops_by_name.end()) { + tsl::mutex_lock l(tf_graph()->mu); + auto it = data->ops_by_name.find(name); + if (it == data->ops_by_name.end()) { throw py::key_error(); } return it->second; } - int version() const { return ops_by_id.size(); } + int version() const { return data->ops_by_id.size(); } py::bytes version_def() const { // Potential deadlock: @@ -509,8 +524,8 @@ struct PyGraph { std::string versions; { py::gil_scoped_release release; - tsl::mutex_lock l(graph->mu); - versions = graph->graph.versions().SerializeAsString(); + tsl::mutex_lock l(tf_graph()->mu); + versions = tf_graph()->graph.versions().SerializeAsString(); } pybind11::gil_scoped_acquire acquire; return py::bytes(versions); @@ -518,52 +533,52 @@ struct PyGraph { tsl::StatusOr _op_def_for_type( const std::string& kTypeName) const { - tsl::mutex_lock l(graph->mu); + tsl::mutex_lock l(tf_graph()->mu); const tensorflow::OpDef* op_def; TF_RETURN_IF_ERROR( - graph->graph.op_registry()->LookUpOpDef(kTypeName, &op_def)); + tf_graph()->graph.op_registry()->LookUpOpDef(kTypeName, &op_def)); return py::bytes(op_def->SerializeAsString()); } void add_control_input(tensorflow::Node* src, tensorflow::Node* dst) { - tsl::mutex_lock l(graph->mu); + tsl::mutex_lock l(tf_graph()->mu); - graph->graph.AddControlEdge(src, dst); + tf_graph()->graph.AddControlEdge(src, dst); record_mutation(*dst, "adding control edge"); } void remove_all_control_inputs(const tensorflow::Node& node) { - tsl::mutex_lock l(graph->mu); + tsl::mutex_lock l(tf_graph()->mu); std::vector control_edges; for (const tensorflow::Edge* edge : node.in_edges()) { if (!edge->IsControlEdge()) continue; control_edges.push_back(edge); } for (const tensorflow::Edge* edge : control_edges) { - graph->graph.RemoveControlEdge(edge); + tf_graph()->graph.RemoveControlEdge(edge); } } void record_mutation(const tensorflow::Node& node, const std::string& reason) - TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu) { - tensorflow::RecordMutation( - graph, reinterpret_cast(node), reason.c_str()); + TF_EXCLUSIVE_LOCKS_REQUIRED(tf_graph()->mu) { + tensorflow::RecordMutation(tf_graph(), + reinterpret_cast(node), + reason.c_str()); } - TF_Graph* tf_graph() { return graph; } + TF_Graph* tf_graph() const { return data->graph; } }; -struct PyOperation { - TFObject_HEAD(PyOperation); - +struct PyOperationData { TF_Operation* tf_op = nullptr; + py::list outputs; // N.B. initialized later by Python. 
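+  // `graph` is a back-reference to the owning PyGraph; `tensor_fn` is the
+  // Python callable that _init_outputs() invokes to build the Python object
+  // for each op output.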
tf_handle graph; py::function tensor_fn; - PyOperation(PyObject* args, PyObject* kwds) { + PyOperationData(PyObject* args, PyObject* kwds) { PyObject *py_op, *py_tensor_fn; if (!PyArg_ParseTuple(args, "OO", &py_op, &py_tensor_fn)) { return; @@ -572,90 +587,92 @@ struct PyOperation { tensor_fn = py::cast(py_tensor_fn); } - ~PyOperation() { Clear(); } + ~PyOperationData() { Clear(); } + + void Dismantle(PyOperation* py_op); void Clear() { Py_CLEAR(outputs.release().ptr()); graph.Clear(); } - void Dismantle(); - int Visit(visitproc visit, void* arg) { Py_VISIT(graph.ptr()); Py_VISIT(outputs.ptr()); return 0; } +}; + +struct PyOperation { + TFObject_HEAD(PyOperation, PyOperationData); + + TF_Operation* tf_op() const { return data->tf_op; } void _init_outputs() { - int num_outputs = TF_OperationNumOutputs(tf_op); + int num_outputs = TF_OperationNumOutputs(tf_op()); for (int i = 0; i < num_outputs; ++i) { - auto dtype = TF_OperationOutputType(TF_Output{tf_op, i}); - outputs.append(tensor_fn(AsPyObject(this), i, dtype)); + auto dtype = TF_OperationOutputType(TF_Output{tf_op(), i}); + data->outputs.append(data->tensor_fn(AsPyObject(this), i, dtype)); } } tsl::Status _add_outputs(py::list dtypes, py::list shapes); - const TF_Operation* op() { return tf_op; } - - TF_Output _tf_output(int idx) const { return TF_Output{tf_op, idx}; } - TF_Input _tf_input(int idx) const { return TF_Input{tf_op, idx}; } + TF_Output _tf_output(int idx) const { return TF_Output{tf_op(), idx}; } + TF_Input _tf_input(int idx) const { return TF_Input{tf_op(), idx}; } py::bytes node_def() { - return py::bytes(tf_op->node.def().SerializeAsString()); + return py::bytes(tf_op()->node.def().SerializeAsString()); } py::bytes op_def() const { - return py::bytes(tf_op->node.op_def().SerializeAsString()); + return py::bytes(tf_op()->node.op_def().SerializeAsString()); } - bool is_stateful() const { return tf_op->node.op_def().is_stateful(); } + bool is_stateful() const { return tf_op()->node.op_def().is_stateful(); } - const std::string& type() { return tf_op->node.type_string(); } + const std::string& type() { return tf_op()->node.type_string(); } void add_control_input(PyOperation* input) { - graph->add_control_input(&input->tf_op->node, &tf_op->node); + data->graph->add_control_input(&input->tf_op()->node, &tf_op()->node); } void add_control_inputs(py::iterable inputs); py::list control_inputs() { py::list output; - for (const auto* edge : tf_op->node.in_edges()) { + for (const auto* edge : tf_op()->node.in_edges()) { if (edge->IsControlEdge() && !edge->src()->IsSource()) { - output.append(graph->ops_by_id[edge->src()->id()]); + output.append(data->graph->data->ops_by_id[edge->src()->id()]); } } return output; } py::list control_outputs() { py::list output; - for (const auto* edge : tf_op->node.out_edges()) { + for (const auto* edge : tf_op()->node.out_edges()) { if (edge->IsControlEdge() && !edge->dst()->IsSink()) { - output.append(graph->ops_by_id[edge->dst()->id()]); + output.append(data->graph->data->ops_by_id[edge->dst()->id()]); } } return output; } void remove_all_control_inputs() { - graph->remove_all_control_inputs(tf_op->node); + data->graph->remove_all_control_inputs(tf_op()->node); } void set_device(const std::string& device) { - tsl::mutex_lock l(graph->graph->mu); - tf_op->node.set_requested_device(device); - graph->record_mutation(tf_op->node, "setting device"); + tsl::mutex_lock l(data->graph->tf_graph()->mu); + tf_op()->node.set_requested_device(device); + data->graph->record_mutation(tf_op()->node, "setting 
device"); } - const std::string& device() { return tf_op->node.requested_device(); } - const std::string& name() { return tf_op->node.name(); } + const std::string& device() { return tf_op()->node.requested_device(); } + const std::string& name() { return tf_op()->node.name(); } }; -struct PyTensor { - TFObject_HEAD(PyTensor); - +struct PyTensorData { py::object tf_output = py::none(); py::object name = py::none(); py::object dtype = py::none(); @@ -667,7 +684,7 @@ struct PyTensor { int value_index = -1; - PyTensor(PyObject* args, PyObject* kwds) { + PyTensorData(PyObject* args, PyObject* kwds) { PyObject *py_op, *py_index, *py_dtype, *py_uid; if (!PyArg_ParseTuple(args, "OOOO", &py_op, &py_index, &py_dtype, &py_uid)) { @@ -676,12 +693,13 @@ struct PyTensor { dtype = py::reinterpret_borrow(py_dtype); value_index = py::cast(py::handle(py_index)); op = py_op; - graph = op->graph; + graph = op->data->graph; name = py::str(absl::StrCat(op->name(), ":", value_index)); - tf_output = py::cast(TF_Output{op->tf_op, value_index}); + tf_output = py::cast(TF_Output{op->tf_op(), value_index}); uid = py::reinterpret_borrow(py_uid); } - ~PyTensor() { Clear(); } + + ~PyTensorData() { Clear(); } void Clear() { Py_CLEAR(tf_output.release().ptr()); @@ -703,14 +721,20 @@ struct PyTensor { Py_VISIT(uid.ptr()); return 0; } +}; + +struct PyTensor { + TFObject_HEAD(PyTensor, PyTensorData); + + int value_index() const { return data->value_index; } tsl::StatusOr shape() { tensorflow::Safe_TF_StatusPtr status = tensorflow::make_safe(TF_NewStatus()); bool unknown_shape = false; auto dims = tensorflow::TF_GraphGetTensorShapeHelper( - graph->tf_graph(), TF_Output{op->tf_op, value_index}, status.get(), - &unknown_shape); + data->graph->tf_graph(), TF_Output{data->op->tf_op(), value_index()}, + status.get(), &unknown_shape); if (!status.get()->status.ok()) { return status.get()->status; } @@ -737,17 +761,17 @@ struct PyTensor { } } tensorflow::TF_GraphSetTensorShape_wrapper( - graph->tf_graph(), TF_Output{op->tf_op, value_index}, dims, - unknown_shape, status.get()); + data->graph->tf_graph(), TF_Output{data->op->tf_op(), value_index()}, + dims, unknown_shape, status.get()); return status.get()->status; } int64_t rank() { - tsl::mutex_lock l(graph->graph->mu); + tsl::mutex_lock l(data->graph->tf_graph()->mu); tensorflow::shape_inference::InferenceContext* ic = - graph->graph->refiner.GetContext(&op->tf_op->node); + data->graph->tf_graph()->refiner.GetContext(&data->op->tf_op()->node); - tensorflow::shape_inference::ShapeHandle shape = ic->output(value_index); + tensorflow::shape_inference::ShapeHandle shape = ic->output(value_index()); if (ic->RankKnown(shape)) { return ic->Rank(shape); } @@ -756,11 +780,11 @@ struct PyTensor { py::list consumers() { py::list out; - for (const auto* edge : op->tf_op->node.out_edges()) { - if (edge->src_output() != value_index) { + for (const auto* edge : data->op->tf_op()->node.out_edges()) { + if (edge->src_output() != value_index()) { continue; } - out.append(graph->ops_by_id[edge->dst()->id()]); + out.append(data->graph->data->ops_by_id[edge->dst()->id()]); } return out; } @@ -770,17 +794,17 @@ PyObject* PyOperation::py_type = nullptr; PyObject* PyTensor::py_type = nullptr; PyObject* PyGraph::py_type = nullptr; -void PyOperation::Dismantle() { +void PyOperationData::Dismantle(PyOperation* py_op) { outputs = py::list(); - PyDict_Clear(dict); graph.Destroy(); + PyDict_Clear(py_op->dict); } tsl::Status PyOperation::_add_outputs(py::list dtypes, py::list shapes) { - int orig_outputs = 
outputs.size(); + int orig_outputs = data->outputs.size(); for (int i = 0; i < dtypes.size(); ++i) { py::object tensor = - tensor_fn(AsPyObject(this), orig_outputs + i, dtypes[i]); + data->tensor_fn(AsPyObject(this), orig_outputs + i, dtypes[i]); // The passed in `shapes` may be TensorShapes, convert them to lists if // needed. @@ -799,24 +823,25 @@ tsl::Status PyOperation::_add_outputs(py::list dtypes, py::list shapes) { } TF_RETURN_IF_ERROR( AsPyTfObject(tensor)->set_shape(dims, unknown_shape)); - outputs.append(tensor); + data->outputs.append(tensor); } return tsl::OkStatus(); } void PyOperation::add_control_inputs(py::iterable inputs) { - tsl::mutex_lock l(graph->tf_graph()->mu); + tsl::mutex_lock l(data->graph->tf_graph()->mu); for (py::handle input : inputs) { auto* input_handle = py::cast(input); - graph->tf_graph()->graph.AddControlEdge(&input_handle->tf_op->node, - &tf_op->node); + data->graph->tf_graph()->graph.AddControlEdge(&input_handle->tf_op()->node, + &tf_op()->node); } - graph->record_mutation(tf_op->node, "adding control input"); + data->graph->record_mutation(tf_op()->node, "adding control input"); } -void PyGraph::Dismantle() { +void PyGraphData::Dismantle() { for (auto& op : op_list) { - AsPyTfObject(op.ptr())->Dismantle(); + AsPyTfObjectData(op.ptr())->Dismantle( + AsPyTfObject(op.ptr())); } op_list = py::list(); ops_by_id.clear(); @@ -825,10 +850,10 @@ void PyGraph::Dismantle() { int64_t PyGraph::add_op(py::object obj) { PyOperation* op_handle = AsPyTfObject(obj); - int64_t op_id = op_handle->tf_op->node.id(); - op_list.append(obj); - ops_by_id[op_id] = obj; - ops_by_name[op_handle->name()] = obj; + int64_t op_id = op_handle->tf_op()->node.id(); + data->op_list.append(obj); + data->ops_by_id[op_id] = obj; + data->ops_by_name[op_handle->name()] = obj; return op_id; } @@ -848,7 +873,7 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { m.attr("PyGraph") = c_graph; c_graph.attr("__module__") = module_name; c_graph.attr("Dismantle") = method(c_graph, [](py::handle handle) { - AsPyTfObject(handle)->Dismantle(); + AsPyTfObjectData(handle)->Dismantle(); }); c_graph.attr("_version_def") = property_readonly([](py::handle handle) { return AsPyTfObject(handle)->version_def(); @@ -861,10 +886,10 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { return AsPyTfObject(handle)->_op_def_for_type(type); }); c_graph.attr("_nodes_by_name") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->ops_by_name; + return AsPyTfObjectData(handle)->ops_by_name; }); c_graph.attr("_nodes_by_id") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->ops_by_id; + return AsPyTfObjectData(handle)->ops_by_id; }); c_graph.attr("_get_operation_by_name") = method(c_graph, [](py::handle handle, std::string name) { @@ -919,18 +944,18 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { return AsPyTfObject(handle)->remove_all_control_inputs(); }); c_op.attr("outputs") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->outputs; + return AsPyTfObjectData(handle)->outputs; }); c_op.attr("graph") = property( [](py::handle handle) { - return AsPyTfObject(handle)->graph.borrow(); + return AsPyTfObjectData(handle)->graph.borrow(); }, [](py::handle handle, py::handle graph) { auto op = AsPyTfObject(handle); - op->graph = graph.ptr(); + op->data->graph = graph.ptr(); }); c_op.attr("_c_op") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->tf_op; + return AsPyTfObject(handle)->tf_op(); }); c_op.attr("_is_stateful") = property_readonly([](py::handle 
handle) { return AsPyTfObject(handle)->is_stateful(); @@ -983,7 +1008,7 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { m.attr("PyTensor") = c_tensor; c_tensor.attr("__module__") = module_name; c_tensor.attr("device") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->op->device(); + return AsPyTfObjectData(handle)->op->device(); }); c_tensor.attr("ndim") = property_readonly([](py::handle handle) { return AsPyTfObject(handle)->rank(); @@ -995,40 +1020,44 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { return AsPyTfObject(handle)->shape(); }); c_tensor.attr("_dtype") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->dtype; + return AsPyTfObjectData(handle)->dtype; }); c_tensor.attr("_name") = property( - [](py::handle handle) { return AsPyTfObject(handle)->name; }, + [](py::handle handle) { + return AsPyTfObjectData(handle)->name; + }, [](py::handle handle, py::object name) { - AsPyTfObject(handle)->name = name; + AsPyTfObjectData(handle)->name = name; }); c_tensor.attr("_shape_val") = property( [](py::handle handle) { auto py_tensor = AsPyTfObject(handle); - return py_tensor->shape_val; + return py_tensor->data->shape_val; }, [](py::handle handle, py::object shape) { - AsPyTfObject(handle)->shape_val = shape; + AsPyTfObjectData(handle)->shape_val = shape; }); c_tensor.attr("_id") = property( - [](py::handle handle) { return AsPyTfObject(handle)->uid; }, + [](py::handle handle) { + return AsPyTfObjectData(handle)->uid; + }, [](py::handle handle, py::object uid) { - AsPyTfObject(handle)->uid = uid; + AsPyTfObjectData(handle)->uid = uid; }); c_tensor.attr("graph") = property_readonly([](py::handle handle) -> py::handle { - auto& graph = AsPyTfObject(handle)->graph; + auto& graph = AsPyTfObjectData(handle)->graph; if (graph.ptr() != nullptr) { return graph.borrow(); } return py::none(); }); c_tensor.attr("_as_tf_output") = method(c_tensor, [](py::handle handle) { - return AsPyTfObject(handle)->tf_output; + return AsPyTfObjectData(handle)->tf_output; }); c_tensor.attr("_op") = property_readonly([](py::handle handle) -> py::handle { - auto& op = AsPyTfObject(handle)->op; + auto& op = AsPyTfObjectData(handle)->op; if (op.ptr() != nullptr) { return op.borrow(); } @@ -1036,7 +1065,7 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { }); c_tensor.attr("op") = property_readonly([](py::handle handle) -> py::handle { - auto& op = AsPyTfObject(handle)->op; + auto& op = AsPyTfObjectData(handle)->op; if (op.ptr() != nullptr) { return op.borrow(); } @@ -1048,7 +1077,7 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { return AsPyTfObject(handle)->set_shape(shape, unknown_shape); }); c_tensor.attr("value_index") = property_readonly([](py::handle handle) { - return AsPyTfObject(handle)->value_index; + return AsPyTfObject(handle)->value_index(); }); c_tensor.attr("consumers") = method(c_tensor, [](py::handle handle) { return AsPyTfObject(handle)->consumers(); diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD index 68bab012e8bf28..8765961c533f7c 100644 --- a/tensorflow/python/compat/BUILD +++ b/tensorflow/python/compat/BUILD @@ -13,14 +13,9 @@ py_strict_library( visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/python:tf2", - "//tensorflow/python/data/experimental/ops:counter", - "//tensorflow/python/data/experimental/ops:interleave_ops", - "//tensorflow/python/data/experimental/ops:random_ops", - "//tensorflow/python/data/experimental/ops:readers", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/data/ops:readers", 
"//tensorflow/python/eager:monitoring", "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:registry", "//tensorflow/python/framework:tensor", "//tensorflow/python/framework:tensor_shape", "//tensorflow/python/ops:control_flow_v2_toggles", diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 73e0ad94e2e434..fd9132a7448210 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -29,7 +29,7 @@ # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 11, 6) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2023, 12, 14) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None diff --git a/tensorflow/python/compat/v2_compat.py b/tensorflow/python/compat/v2_compat.py index cef625b1dc355a..5820e477eb2e5f 100644 --- a/tensorflow/python/compat/v2_compat.py +++ b/tensorflow/python/compat/v2_compat.py @@ -15,19 +15,13 @@ """Switching v2 features on and off.""" from tensorflow.python import tf2 -from tensorflow.python.data.experimental.ops import counter -from tensorflow.python.data.experimental.ops import interleave_ops -from tensorflow.python.data.experimental.ops import random_ops -from tensorflow.python.data.experimental.ops import readers as exp_readers -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.ops import readers from tensorflow.python.eager import monitoring from tensorflow.python.framework import ops +from tensorflow.python.framework import registry from tensorflow.python.framework import tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import control_flow_v2_toggles from tensorflow.python.ops import resource_variables_toggle - from tensorflow.python.util.tf_export import tf_export # Metrics to track the status of v2_behavior @@ -35,6 +29,12 @@ "/tensorflow/version/v2_behavior", "whether v2_behavior is enabled or disabled", "status") +_DATA_V2_CALLBACKS = registry.Registry("data_v2_callbacks") + + +def register_data_v2_callback(data_v2_func): + _DATA_V2_CALLBACKS.register(data_v2_func, data_v2_func.__module__) + @tf_export(v1=["enable_v2_behavior"]) def enable_v2_behavior(): @@ -65,19 +65,9 @@ def enable_v2_behavior(): # Enables TensorArrayV2 and control flow V2. control_flow_v2_toggles.enable_control_flow_v2() # Make sure internal uses of tf.data symbols map to V2 versions. 
- dataset_ops.Dataset = dataset_ops.DatasetV2 - readers.FixedLengthRecordDataset = readers.FixedLengthRecordDatasetV2 - readers.TFRecordDataset = readers.TFRecordDatasetV2 - readers.TextLineDataset = readers.TextLineDatasetV2 - counter.Counter = counter.CounterV2 - interleave_ops.choose_from_datasets = interleave_ops.choose_from_datasets_v2 - interleave_ops.sample_from_datasets = interleave_ops.sample_from_datasets_v2 - random_ops.RandomDataset = random_ops.RandomDatasetV2 - exp_readers.CsvDataset = exp_readers.CsvDatasetV2 - exp_readers.SqlDataset = exp_readers.SqlDatasetV2 - exp_readers.make_batched_features_dataset = ( - exp_readers.make_batched_features_dataset_v2) - exp_readers.make_csv_dataset = exp_readers.make_csv_dataset_v2 + for v2_enabler_name in _DATA_V2_CALLBACKS.list(): + v2_enabler = _DATA_V2_CALLBACKS.lookup(v2_enabler_name) + v2_enabler() @tf_export(v1=["disable_v2_behavior"]) @@ -110,16 +100,6 @@ def disable_v2_behavior(): # Disables TensorArrayV2 and control flow V2. control_flow_v2_toggles.disable_control_flow_v2() # Make sure internal uses of tf.data symbols map to V1 versions. - dataset_ops.Dataset = dataset_ops.DatasetV1 - readers.FixedLengthRecordDataset = readers.FixedLengthRecordDatasetV1 - readers.TFRecordDataset = readers.TFRecordDatasetV1 - readers.TextLineDataset = readers.TextLineDatasetV1 - counter.Counter = counter.CounterV1 - interleave_ops.choose_from_datasets = interleave_ops.choose_from_datasets_v1 - interleave_ops.sample_from_datasets = interleave_ops.sample_from_datasets_v1 - random_ops.RandomDataset = random_ops.RandomDatasetV1 - exp_readers.CsvDataset = exp_readers.CsvDatasetV1 - exp_readers.SqlDataset = exp_readers.SqlDatasetV1 - exp_readers.make_batched_features_dataset = ( - exp_readers.make_batched_features_dataset_v1) - exp_readers.make_csv_dataset = exp_readers.make_csv_dataset_v1 + for v2_disabler_name in _DATA_V2_CALLBACKS.list(): + v2_disabler = _DATA_V2_CALLBACKS.lookup(v2_disabler_name) + v2_disabler() diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD index 9fbdcf56b1a023..0dc49c5a56f3fb 100644 --- a/tensorflow/python/compiler/tensorrt/BUILD +++ b/tensorflow/python/compiler/tensorrt/BUILD @@ -32,12 +32,10 @@ py_strict_library( py_strict_library( name = "trt_convert_py", - srcs = [ - "trt_convert.py", - "utils.py", - ], + srcs = ["trt_convert.py"], srcs_version = "PY3", deps = [ + ":utils", "//tensorflow/compiler/tf2tensorrt:_pywrap_py_utils", "//tensorflow/compiler/tf2tensorrt:trt_ops_loader", "//tensorflow/core:protos_all_py", @@ -69,19 +67,30 @@ py_strict_library( "//tensorflow/python/util:nest", "//tensorflow/python/util:tf_export", "//third_party/py/numpy", - "@pypi_packaging//:pkg", "@six_archive//:six", ], ) +py_strict_library( + name = "utils", + srcs = ["utils.py"], + deps = [ + "//tensorflow/compiler/tf2tensorrt:_pywrap_py_utils", + "//tensorflow/core:protos_all_py", + "//tensorflow/python/framework:dtypes", + "@pypi_packaging//:pkg", + ], +) + py_strict_library( name = "tf_trt_integration_test_base", - srcs = ["//tensorflow/python/compiler/tensorrt/test:tf_trt_integration_test_base_srcs"], srcs_version = "PY3", deps = [ ":trt_convert_py", + ":utils", "//tensorflow/compiler/tf2tensorrt:_pywrap_py_utils", "//tensorflow/core:protos_all_py", + "//tensorflow/python/compiler/tensorrt/test:tf_trt_integration_test_base_srcs", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:config", "//tensorflow/python/framework:graph_io", @@ -121,6 +130,8 @@ cuda_py_strict_test( 
"no_oss", "no_pip", "nomac", + # TODO(b/303453873): Re-enable tests once TensorRT has been updated + "notap", ], xla_enable_strict_auto_jit = False, deps = [ diff --git a/tensorflow/python/compiler/tensorrt/test/BUILD b/tensorflow/python/compiler/tensorrt/test/BUILD index 4bea640efd6015..15499cbdf79c39 100644 --- a/tensorflow/python/compiler/tensorrt/test/BUILD +++ b/tensorflow/python/compiler/tensorrt/test/BUILD @@ -30,6 +30,7 @@ py_strict_library( "//tensorflow/compiler/tf2tensorrt:_pywrap_py_utils", "//tensorflow/core:protos_all_py", "//tensorflow/python/compiler/tensorrt:trt_convert_py", + "//tensorflow/python/compiler/tensorrt:utils", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:config", "//tensorflow/python/framework:graph_io", @@ -93,6 +94,8 @@ base_tags = [ "no_rocm", "no_windows", "nomac", + # TODO(b/303453873): Re-enable tests once TensorRT has been updated + "notap", ] cuda_py_strict_test( @@ -106,7 +109,7 @@ cuda_py_strict_test( xla_enable_strict_auto_jit = False, deps = [ ":tf_trt_integration_test_base_srcs", - "//tensorflow/python/compiler/tensorrt:trt_convert_py", + "//tensorflow/python/compiler/tensorrt:utils", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", @@ -143,7 +146,7 @@ cuda_py_strict_test( xla_enable_strict_auto_jit = False, deps = [ ":tf_trt_integration_test_base_srcs", - "//tensorflow/python/compiler/tensorrt:trt_convert_py", + "//tensorflow/python/compiler/tensorrt:utils", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/ops:array_ops", @@ -243,7 +246,7 @@ cuda_py_strict_test( xla_enable_strict_auto_jit = False, deps = [ ":tf_trt_integration_test_base_srcs", - "//tensorflow/python/compiler/tensorrt:trt_convert_py", + "//tensorflow/python/compiler/tensorrt:utils", "//tensorflow/python/framework:dtypes", "//tensorflow/python/ops:array_ops", "//tensorflow/python/ops:math_ops", diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD index be65e17f9784e4..d023a58baf0ef8 100644 --- a/tensorflow/python/data/benchmarks/BUILD +++ b/tensorflow/python/data/benchmarks/BUILD @@ -1,5 +1,8 @@ load("//tensorflow:strict.default.bzl", "py_strict_library") -load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test") +load( + "//tensorflow/tools/test:performance.bzl", + "tf_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -22,7 +25,7 @@ py_strict_library( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "meta_benchmark", srcs = ["meta_benchmark.py"], deps = [ @@ -36,7 +39,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "batch_benchmark", srcs = ["batch_benchmark.py"], deps = [ @@ -49,7 +52,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "filter_benchmark", srcs = ["filter_benchmark.py"], deps = [ @@ -59,7 +62,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "from_tensor_slices_benchmark", srcs = ["from_tensor_slices_benchmark.py"], deps = [ @@ -74,7 +77,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "interleave_benchmark", srcs = ["interleave_benchmark.py"], deps = [ @@ -85,7 +88,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "list_files_benchmark", srcs = ["list_files_benchmark.py"], deps = [ @@ -94,7 +97,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( 
+tf_py_benchmark_test( name = "map_benchmark", srcs = ["map_benchmark.py"], deps = [ @@ -109,7 +112,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "prefetch_benchmark", srcs = ["prefetch_benchmark.py"], deps = [ @@ -118,7 +121,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "range_benchmark", srcs = ["range_benchmark.py"], deps = [ diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD index a9eef9c7ad6e91..e61a15a2ae88f1 100644 --- a/tensorflow/python/data/experimental/benchmarks/BUILD +++ b/tensorflow/python/data/experimental/benchmarks/BUILD @@ -1,5 +1,8 @@ load("//tensorflow:strict.default.bzl", "py_strict_binary") -load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test") +load( + "//tensorflow/tools/test:performance.bzl", + "tf_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -22,7 +25,7 @@ py_strict_binary( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "autotune_benchmark", srcs = ["autotune_benchmark.py"], deps = [ @@ -34,7 +37,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "parameter_value_benchmark", srcs = ["parameter_value_benchmark.py"], deps = [ @@ -47,7 +50,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "csv_dataset_benchmark", srcs = ["csv_dataset_benchmark.py"], tags = ["no_pip"], @@ -61,7 +64,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "map_and_batch_benchmark", srcs = ["map_and_batch_benchmark.py"], deps = [ @@ -77,7 +80,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "map_defun_benchmark", srcs = ["map_defun_benchmark.py"], deps = [ @@ -92,7 +95,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "matching_files_benchmark", size = "small", srcs = ["matching_files_benchmark.py"], @@ -102,7 +105,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "optimize_benchmark", srcs = ["optimize_benchmark.py"], deps = [ @@ -113,7 +116,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "rejection_resample_benchmark", srcs = ["rejection_resample_benchmark.py"], tags = ["no_pip"], @@ -126,7 +129,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "snapshot_dataset_benchmark", srcs = ["snapshot_dataset_benchmark.py"], deps = [ @@ -138,7 +141,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "unbatch_benchmark", srcs = ["unbatch_benchmark.py"], deps = [ diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD index 93f59e64bc6ccb..32c958e4185d83 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD @@ -90,6 +90,8 @@ tf_py_strict_test( "//tensorflow/python/framework:combinations", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:check_ops", "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py 
b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py index 03f795a9212d84..2a5848fb87befb 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py @@ -24,6 +24,8 @@ from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -110,8 +112,66 @@ def testMapFusion(self, functions, num_parallel_calls, deterministic): r = function(r) expected_output.append(r) - if num_parallel_calls is None or deterministic in [None, True]: - self.assertDatasetProduces(dataset, expected_output=expected_output) + nondeterministic_ordering = ( + num_parallel_calls is not None and deterministic is False # pylint: disable=g-bool-id-comparison + ) + self.assertDatasetProduces( + dataset, + expected_output=expected_output, + assert_items_equal=nondeterministic_ordering, + ) + + @combinations.generate(test_base.default_test_combinations()) + def testMapFusionLongMapChain(self): + n = 5 + dataset = dataset_ops.Dataset.range(n) + dataset = dataset.apply( + testing.assert_next(["ParallelMap", "MemoryCacheImpl"]) + ) + + k = 50 + for _ in range(k): + dataset = dataset.map( + lambda x: 2 * x, + num_parallel_calls=dataset_ops.AUTOTUNE, + ) + + dataset = dataset.cache() + options = options_lib.Options() + options.experimental_optimization.apply_default_optimizations = False + options.experimental_optimization.map_fusion = True + dataset = dataset.with_options(options) + + self.assertDatasetProduces( + dataset, + expected_output=[x * 2**k for x in range(n)], + assert_items_equal=True, + ) + + @combinations.generate(test_base.default_test_combinations()) + def testControlInputs(self): + def f(x): + with ops.control_dependencies([check_ops.assert_type(x, dtypes.int64)]): + return 2 * x + + n = 5 + dataset = dataset_ops.Dataset.range(n) + dataset = dataset.apply( + testing.assert_next(["ParallelMap", "MemoryCacheImpl"]) + ) + dataset = dataset.map(f, num_parallel_calls=dataset_ops.AUTOTUNE) + dataset = dataset.map(f, num_parallel_calls=dataset_ops.AUTOTUNE) + + dataset = dataset.cache() + options = options_lib.Options() + options.experimental_optimization.apply_default_optimizations = False + options.experimental_optimization.map_fusion = True + dataset = dataset.with_options(options) + self.assertDatasetProduces( + dataset, + expected_output=[x * 4 for x in range(n)], + assert_items_equal=True, + ) @combinations.generate( combinations.times( diff --git a/tensorflow/python/data/experimental/kernel_tests/service/BUILD b/tensorflow/python/data/experimental/kernel_tests/service/BUILD index cfac30fe0dbb47..f43db5fd64b852 100644 --- a/tensorflow/python/data/experimental/kernel_tests/service/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/service/BUILD @@ -243,6 +243,27 @@ tf_py_strict_test( ], ) +tf_py_strict_test( + name = "distributed_save_load_test", + size = "medium", + srcs = ["distributed_save_load_test.py"], + shard_count = 8, + deps = [ + ":test_base", + "//tensorflow/python/data/experimental/ops:data_service_ops", + "//tensorflow/python/data/experimental/ops:distributed_save_op", + "//tensorflow/python/data/kernel_tests:checkpoint_test_base", + "//tensorflow/python/data/kernel_tests:test_base", 
+ "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/framework:combinations", + "//tensorflow/python/framework:errors", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/platform:test", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_strict_test( name = "distributed_save_ft_test", size = "medium", diff --git a/tensorflow/python/data/experimental/kernel_tests/service/distributed_save_load_test.py b/tensorflow/python/data/experimental/kernel_tests/service/distributed_save_load_test.py new file mode 100644 index 00000000000000..a7e21a5d939321 --- /dev/null +++ b/tensorflow/python/data/experimental/kernel_tests/service/distributed_save_load_test.py @@ -0,0 +1,254 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for distributed save/load with the new load algorithm.""" + +import os +import shutil +import tempfile +import threading +import time +from typing import Callable, Optional + +from absl.testing import parameterized +import numpy as np + +from tensorflow.python.data.experimental.kernel_tests.service import test_base as data_service_test_base +from tensorflow.python.data.experimental.ops import data_service_ops +from tensorflow.python.data.experimental.ops import distributed_save_op +from tensorflow.python.data.kernel_tests import checkpoint_test_base +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import load_op +from tensorflow.python.framework import combinations +from tensorflow.python.framework import errors +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest +from tensorflow.python.platform import test + + +class TestSnapshot: + """Test data for snapshots.""" + + def __init__(self): + temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir()) + self.path = os.path.join( + tempfile.mkdtemp(dir=temp_dir), "distributed_save_load_test") + + def __del__(self): + shutil.rmtree(self.path) + + +class DistributedSaveLoadTest( + data_service_test_base.TestBase, parameterized.TestCase): + """Tests for distributed save/load with the new load algorithm. + + TODO(b/297930782): Add fault tolerance tests. 
+ """ + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + num_workers=[1, 3], + num_elements=[0, 10], + num_repetitions=[1, 3], + compression=[None, "AUTO", "GZIP"]))) + def test_save_load( + self, + num_workers: int, + num_elements: int, + num_repetitions: int, + compression: Optional[str]): + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=num_workers) + dataset = dataset_ops.Dataset.range(num_elements) + dataset = dataset.repeat(num_repetitions) + self.evaluate( + distributed_save_op.distributed_save( + dataset, test_snapshot.path, cluster.dispatcher_address())) + + # Unlike the old load op, v2 does not need to wait for snapshot to finish. + dataset = load_op._load_distributed_snapshot_v2(test_snapshot.path) + self.assertDatasetProduces( + dataset, + list(range(num_elements)) * num_repetitions, + assert_items_equal=True) + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_workers=[1, 3]))) + def test_concurrent_save_load(self, num_workers: int): + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=num_workers) + + def load_thread_fn(): + dataset = load_op._load_distributed_snapshot_v2(test_snapshot.path) + self.assertDatasetProduces( + dataset, list(range(10)), assert_items_equal=True) + load_thread = threading.Thread(target=load_thread_fn, name="load_thread") + load_thread.start() + + def save_thread_fn(): + time.sleep(5) + dataset = dataset_ops.Dataset.range(10) + self.evaluate( + distributed_save_op.distributed_save( + dataset, test_snapshot.path, cluster.dispatcher_address())) + save_thread = threading.Thread(target=save_thread_fn, name="save_thread") + save_thread.start() + save_thread.join() + load_thread.join() + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_workers=[1, 3], num_elements=[0, 10]))) + def test_distributed_load(self, num_workers: int, num_elements: int): + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=num_workers) + dataset = dataset_ops.Dataset.range(num_elements) + self.evaluate( + distributed_save_op.distributed_save( + dataset, test_snapshot.path, cluster.dispatcher_address())) + + dataset = load_op._load_distributed_snapshot_v2(test_snapshot.path) + # TODO(b/297930782): Support dynamic sharding. 
+ dataset = dataset.apply( + data_service_ops.distribute( + data_service_ops.ShardingPolicy.OFF, cluster.dispatcher_address())) + self.assertDatasetProduces( + dataset, + list(range(num_elements)) * num_workers, + assert_items_equal=True) + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_workers=[1, 3]))) + def test_save_before_sample(self, num_workers: int): + num_elements = 10 + num_datasets = 3 + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=num_workers) + datasets = [ + dataset_ops.Dataset.range(num_elements) for i in range(num_datasets)] + for i, dataset in enumerate(datasets): + self.evaluate( + distributed_save_op.distributed_save( + dataset, + os.path.join(test_snapshot.path, f"dataset_{i}"), + cluster.dispatcher_address())) + + loaded_datasets = [] + for i in range(len(datasets)): + loaded_datasets.append( + load_op._load_distributed_snapshot_v2( + os.path.join(test_snapshot.path, f"dataset_{i}"))) + dataset = dataset_ops.Dataset.sample_from_datasets( + loaded_datasets, + weights=[1.0] * num_datasets, + stop_on_empty_dataset=False) + self.assertDatasetProduces( + dataset, + list(range(num_elements)) * num_datasets, + assert_items_equal=True) + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_workers=[1, 3], num_repetitions=[1, 3]))) + def test_save_after_sample(self, num_workers: int, num_repetitions: int): + num_elements = 10 + num_datasets = 3 + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=num_workers) + datasets = [ + dataset_ops.Dataset.range(num_elements) for i in range(num_datasets)] + if num_repetitions > 1: + datasets = [dataset.repeat(num_repetitions) for dataset in datasets] + dataset = dataset_ops.Dataset.sample_from_datasets( + datasets, weights=[1.0] * num_datasets, stop_on_empty_dataset=False) + self.evaluate( + distributed_save_op.distributed_save( + dataset, test_snapshot.path, cluster.dispatcher_address())) + + dataset = load_op._load_distributed_snapshot_v2(test_snapshot.path) + self.assertDatasetProduces( + dataset, + list(range(num_elements)) * num_datasets * num_repetitions, + assert_items_equal=True) + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_workers=[1, 3]))) + def test_enumerate(self, num_workers: int): + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers) + dataset = dataset_ops.Dataset.from_tensor_slices(["a", "b", "c"]) + dataset = dataset.repeat(3) + dataset = dataset.enumerate() + self.evaluate( + distributed_save_op.distributed_save( + dataset, test_snapshot.path, cluster.dispatcher_address())) + + dataset = load_op._load_distributed_snapshot_v2(test_snapshot.path) + indexes, elements = map(list, zip(*self.getDatasetOutput(dataset))) + if num_workers == 1: + self.assertCountEqual(indexes, list(range(9))) + self.assertCountEqual(elements, [b"a", b"b", b"c"] * 3) + + @combinations.generate(test_base.default_test_combinations()) + def test_worker_failure(self): + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=1) + components = np.array([1.0, 2.0, 3.0, np.nan, 5.0]).astype(np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices(components) + dataset = dataset.map(lambda x: array_ops.check_numerics(x, "message")) + self.evaluate( + distributed_save_op.distributed_save( + 
dataset, test_snapshot.path, cluster.dispatcher_address())) + + with self.assertRaises(errors.InvalidArgumentError): + dataset = load_op._load_distributed_snapshot_v2(test_snapshot.path) + self.getDatasetOutput(dataset) + + +class SaveLoadCheckpointTest( + data_service_test_base.TestBase, + checkpoint_test_base.CheckpointTestBase, + parameterized.TestCase): + + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + checkpoint_test_base.default_test_combinations())) + def test_save_load_checkpoint(self, verify_fn: Callable[..., None]): + test_snapshot = TestSnapshot() + cluster = data_service_test_base.TestCluster(num_workers=1) + dataset = dataset_ops.Dataset.range(10) + self.evaluate( + distributed_save_op.distributed_save( + dataset, test_snapshot.path, cluster.dispatcher_address())) + + def _build_ds() -> dataset_ops.Dataset: + return load_op._load_distributed_snapshot_v2(test_snapshot.path) + + verify_fn(self, _build_ds, num_outputs=10) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD index c3e153a4e775e6..cc604a2afebe22 100644 --- a/tensorflow/python/data/experimental/ops/BUILD +++ b/tensorflow/python/data/experimental/ops/BUILD @@ -56,6 +56,7 @@ py_strict_library( srcs_version = "PY3", deps = [ "//tensorflow/python:tf2", + "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/framework:dtypes", "//tensorflow/python/util:deprecation", @@ -194,6 +195,7 @@ py_strict_library( srcs_version = "PY3", deps = [ "//tensorflow/python:tf2", + "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:readers", "//tensorflow/python/util:deprecation", @@ -349,6 +351,7 @@ py_strict_library( srcs_version = "PY3", deps = [ "//tensorflow/python:tf2", + "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:tf_export", @@ -365,6 +368,7 @@ py_strict_library( ":error_ops", ":parsing_ops", "//tensorflow/python:tf2", + "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:options", "//tensorflow/python/data/ops:readers", diff --git a/tensorflow/python/data/experimental/ops/counter.py b/tensorflow/python/data/experimental/ops/counter.py index 2a8eaaae76afaa..e9dc2b49a0ea0d 100644 --- a/tensorflow/python/data/experimental/ops/counter.py +++ b/tensorflow/python/data/experimental/ops/counter.py @@ -14,6 +14,7 @@ # ============================================================================== """The Counter Dataset.""" from tensorflow.python import tf2 +from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.util import deprecation @@ -70,3 +71,14 @@ def CounterV1(start=0, step=1, dtype=dtypes.int64): Counter = CounterV2 else: Counter = CounterV1 + + +def _tf2_callback(): # pylint: disable=invalid-name + global Counter + if tf2.enabled(): + Counter = CounterV2 + else: + Counter = CounterV1 + + +v2_compat.register_data_v2_callback(_tf2_callback) diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index baf0862379cd4f..961bca376a9662 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ 
b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -16,6 +16,7 @@ import enum import functools +from typing import Callable from tensorflow.core.protobuf import data_service_pb2 from tensorflow.python import tf2 @@ -435,17 +436,19 @@ def _parse_service(service) -> tuple[str, str]: return (protocol, address) -def _distribute(processing_mode, - service, - job_name=None, - consumer_index=None, - num_consumers=None, - max_outstanding_requests=None, - task_refresh_interval_hint_ms=None, - data_transfer_protocol=None, - compression="AUTO", - cross_trainer_cache=None, - target_workers="AUTO") -> dataset_ops.Dataset: +def _distribute( + processing_mode, + service, + job_name=None, + consumer_index=None, + num_consumers=None, + max_outstanding_requests=None, + task_refresh_interval_hint_ms=None, + data_transfer_protocol=None, + compression="AUTO", + cross_trainer_cache=None, + target_workers="AUTO", +) -> Callable[[dataset_ops.Dataset], dataset_ops.Dataset]: """A transformation that moves dataset processing to the tf.data service. This transformation is similar to `distribute`, but supports additional @@ -529,16 +532,18 @@ def _apply_fn(dataset) -> dataset_ops.Dataset: # pylint: disable=missing-docstr @tf_export("data.experimental.service.distribute") -def distribute(processing_mode, - service, - job_name=None, - consumer_index=None, - num_consumers=None, - max_outstanding_requests=None, - data_transfer_protocol=None, - compression="AUTO", - cross_trainer_cache=None, - target_workers="AUTO") -> dataset_ops.Dataset: +def distribute( + processing_mode, + service, + job_name=None, + consumer_index=None, + num_consumers=None, + max_outstanding_requests=None, + data_transfer_protocol=None, + compression="AUTO", + cross_trainer_cache=None, + target_workers="AUTO", +) -> Callable[[dataset_ops.Dataset], dataset_ops.Dataset]: """A transformation that moves dataset processing to the tf.data service.
When you iterate over a dataset containing the `distribute` transformation, diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py index 4cf61f9d5c7f9b..7f1d97d6a0e90e 100644 --- a/tensorflow/python/data/experimental/ops/interleave_ops.py +++ b/tensorflow/python/data/experimental/ops/interleave_ops.py @@ -14,6 +14,7 @@ # ============================================================================== """Non-deterministic dataset transformations.""" from tensorflow.python import tf2 +from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers from tensorflow.python.util import deprecation @@ -245,3 +246,16 @@ def choose_from_datasets_v1(datasets, else: choose_from_datasets = choose_from_datasets_v1 sample_from_datasets = sample_from_datasets_v1 + + +def _tf2_callback(): + global choose_from_datasets, sample_from_datasets + if tf2.enabled(): + choose_from_datasets = choose_from_datasets_v2 + sample_from_datasets = sample_from_datasets_v2 + else: + choose_from_datasets = choose_from_datasets_v1 + sample_from_datasets = sample_from_datasets_v1 + + +v2_compat.register_data_v2_callback(_tf2_callback) diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py index 8e951ea962c3d9..a88f14a8063b42 100644 --- a/tensorflow/python/data/experimental/ops/random_ops.py +++ b/tensorflow/python/data/experimental/ops/random_ops.py @@ -16,6 +16,7 @@ import functools from tensorflow.python import tf2 +from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import random_op from tensorflow.python.util import deprecation @@ -44,3 +45,14 @@ def __init__(self, seed=None): RandomDataset = RandomDatasetV2 else: RandomDataset = RandomDatasetV1 + + +def _tf2_callback(): + global RandomDataset + if tf2.enabled(): + RandomDataset = RandomDatasetV2 + else: + RandomDataset = RandomDatasetV1 + + +v2_compat.register_data_v2_callback(_tf2_callback) diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index 1ae47f4c9c8e70..75a4a9c39ffa50 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -21,6 +21,7 @@ import numpy as np from tensorflow.python import tf2 +from tensorflow.python.compat import v2_compat from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.experimental.ops import parsing_ops from tensorflow.python.data.ops import dataset_ops @@ -1220,3 +1221,20 @@ def __init__(self, driver_name, data_source_name, query, output_types): SqlDataset = SqlDatasetV1 make_batched_features_dataset = make_batched_features_dataset_v1 make_csv_dataset = make_csv_dataset_v1 + + +def _tf2_callback(): + global CsvDataset, SqlDataset, make_batched_features_dataset, make_csv_dataset + if tf2.enabled(): + CsvDataset = CsvDatasetV2 + SqlDataset = SqlDatasetV2 + make_batched_features_dataset = make_batched_features_dataset_v2 + make_csv_dataset = make_csv_dataset_v2 + else: + CsvDataset = CsvDatasetV1 + SqlDataset = SqlDatasetV1 + make_batched_features_dataset = make_batched_features_dataset_v1 + make_csv_dataset = make_csv_dataset_v1 + + +v2_compat.register_data_v2_callback(_tf2_callback) diff --git a/tensorflow/python/data/kernel_tests/BUILD 
b/tensorflow/python/data/kernel_tests/BUILD index dc07984864b6b4..4182a927caf562 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -33,7 +33,7 @@ tf_py_strict_test( name = "batch_test", size = "medium", srcs = ["batch_test.py"], - shard_count = 4, + shard_count = 8, deps = [ ":checkpoint_test_base", ":test_base", @@ -509,6 +509,7 @@ tf_py_strict_test( deps = [ ":checkpoint_test_base", ":test_base", + "//tensorflow/core:protos_all_py", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/framework:combinations", "//tensorflow/python/framework:constant_op", diff --git a/tensorflow/python/data/kernel_tests/flat_map_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py index 29c9cf72aea840..4a3becfd753faa 100644 --- a/tensorflow/python/data/kernel_tests/flat_map_test.py +++ b/tensorflow/python/data/kernel_tests/flat_map_test.py @@ -352,6 +352,47 @@ def _build_ds(): verify_fn(self, _build_ds, num_outputs=20) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + checkpoint_test_base.default_test_combinations(), + combinations.combine(symbolic_checkpoint=[True], + num_skips=[3, 4]), + ) + ) + def testWithSkip(self, verify_fn, symbolic_checkpoint, num_skips): + """Test `.flat_map().skip()` checkpointing behavior. + + `SkipInternal` and `GetNextInternal` are separate functions + but with slightly different implementations. + Therefore, we should test this op's behavior when used with `.skip()`. + + Args: + verify_fn: Verify the correctness of this dataset's checkpointing. + symbolic_checkpoint: Whether symbolic checkpointing is turned on. + num_skips: `.skip(num_skips)` + """ + + def build_dataset(): + def my_map(x): + if x == 0: + return dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3]) + elif x == 1: + return dataset_ops.Dataset.from_tensor_slices([4, 5, 6, 7]) + else: + return dataset_ops.Dataset.from_tensor_slices([8, 9, 10, 11]) + + indices = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = indices.flat_map(my_map) + # Skip some elements + dataset = dataset.skip(num_skips) + + options = options_lib.Options() + options.experimental_symbolic_checkpoint = symbolic_checkpoint + return dataset.with_options(options) + + verify_fn(self, build_dataset, num_outputs=3 * 4 - num_skips) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/kernel_tests/group_by_window_test.py b/tensorflow/python/data/kernel_tests/group_by_window_test.py index 36ba1659bbf981..461967f528f22c 100644 --- a/tensorflow/python/data/kernel_tests/group_by_window_test.py +++ b/tensorflow/python/data/kernel_tests/group_by_window_test.py @@ -16,6 +16,7 @@ from absl.testing import parameterized import numpy as np +from tensorflow.core.lib.core import error_codes_pb2 from tensorflow.python.data.kernel_tests import checkpoint_test_base from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops @@ -277,26 +278,6 @@ def testEmpty(self): "Window size must be greater than zero, but got 0."): print(self.evaluate(get_next())) - @combinations.generate(test_base.default_test_combinations()) - def testReduceFuncError(self): - components = np.random.randint(100, size=(200,)).astype(np.int64) - - def reduce_func(_, xs): - # Introduce an incorrect padded shape that cannot (currently) be - # detected at graph construction time.
- return xs.padded_batch( - 4, - padded_shapes=(tensor_shape.TensorShape([]), - constant_op.constant([5], dtype=dtypes.int64) * -1)) - - dataset = dataset_ops.Dataset.from_tensor_slices(components) - dataset = dataset.map(lambda x: (x, ops.convert_to_tensor([x * x]))) - dataset = dataset.group_by_window( - key_func=lambda x, _: x % 2, reduce_func=reduce_func, window_size=32) - get_next = self.getNext(dataset) - with self.assertRaises(errors.InvalidArgumentError): - self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testConsumeWindowDatasetMoreThanOnce(self): components = np.random.randint(50, size=(200,)).astype(np.int64) @@ -399,5 +380,73 @@ def test(self): verify_exhausted=False) +class GroupByWindowErrorMessageTest( + test_base.DatasetTestBase, parameterized.TestCase +): + + @combinations.generate(test_base.default_test_combinations()) + def testReduceFuncError(self): + components = np.random.randint(100, size=(200,)).astype(np.int64) + + def my_reduce_func(_, window_dataset): + # Introduce an incorrect padded shape that cannot (currently) be + # detected at graph construction time. + return window_dataset.padded_batch( + 4, + padded_shapes=( + tensor_shape.TensorShape([]), + constant_op.constant([5], dtype=dtypes.int64) * -1, + ), + ) + + dataset = dataset_ops.Dataset.from_tensor_slices(components) + dataset = dataset.map(lambda x: (x, ops.convert_to_tensor([x * x]))) + dataset = dataset.group_by_window( + key_func=lambda x, _: x % 2, reduce_func=my_reduce_func, window_size=32 + ) + get_next = self.getNext(dataset) + with self.assertRaises(errors.InternalError) as error: + self.evaluate(get_next()) + + msg = str(error.exception) + self.assertIn(error_codes_pb2.Code.Name(errors.INVALID_ARGUMENT), msg) + self.assertIn( + my_reduce_func.__name__, + msg, + "{} should show up in the error message".format( + my_reduce_func.__name__ + ), + ) + + @combinations.generate(test_base.default_test_combinations()) + def testPropagateUserDefinedFunctionErrorMessage(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0]) + + def a_cool_user_defined_reduce_func(unused_key, window_dataset): + it = iter(window_dataset) + l = [next(it) for _ in range(2)] # This causes OutOfRange error + return dataset_ops.Dataset.from_tensor_slices(l) + + dataset = dataset.group_by_window( + key_func=lambda x: 0, + window_size=2, + reduce_func=a_cool_user_defined_reduce_func, + ) + + get_next = self.getNext(dataset) + with self.assertRaisesRegex( + errors.InternalError, + ".*{}.*".format(a_cool_user_defined_reduce_func.__name__), + msg=( + "The name of user-defined-function should show up in the error" + " message" + ), + ): + # Loop over the dataset + with self.assertRaises(errors.OutOfRangeError): + while True: + self.evaluate(get_next()) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/kernel_tests/placement_test.py b/tensorflow/python/data/kernel_tests/placement_test.py index 6c9efc53f2486a..35f929737c6ff6 100644 --- a/tensorflow/python/data/kernel_tests/placement_test.py +++ b/tensorflow/python/data/kernel_tests/placement_test.py @@ -198,7 +198,7 @@ def create_iter(): create_iter() @combinations.generate(test_base.graph_only_combinations()) - @test_util.run_gpu_only() + @test_util.run_gpu_only def testIteratorOnDeviceGraphModeOneShotIterator(self): self.skipTest("TODO(b/169429285): tf.data.Dataset.make_one_shot_iterator " "does not support GPU placement.") @@ -230,7 +230,7 @@ def testIteratorOnDeviceGraphModeOneShotIterator(self): 
self.assertIn(b"GPU:0", self.evaluate(has_value_device)) @combinations.generate(test_base.graph_only_combinations()) - @test_util.run_gpu_only() + @test_util.run_gpu_only def testIteratorOnDeviceGraphModeInitializableIterator(self): dataset = dataset_ops.Dataset.range(10) dataset = dataset.apply(prefetching_ops.prefetch_to_device("/gpu:0")) @@ -259,7 +259,7 @@ def testIteratorOnDeviceGraphModeInitializableIterator(self): self.assertIn(b"GPU:0", self.evaluate(has_value_device)) @combinations.generate(test_base.eager_only_combinations()) - @test_util.run_gpu_only() + @test_util.run_gpu_only def testIterDatasetEagerModeWithExplicitDevice(self): @def_function.function @@ -274,7 +274,7 @@ def comp(): self.assertEqual(result.numpy(), 45) @combinations.generate(test_base.eager_only_combinations()) - @test_util.run_gpu_only() + @test_util.run_gpu_only def testFunctionInliningColocation(self): @def_function.function diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD index b7706ef00699c5..b5d55728b53b47 100644 --- a/tensorflow/python/data/ops/BUILD +++ b/tensorflow/python/data/ops/BUILD @@ -98,6 +98,7 @@ py_strict_library( "//tensorflow/python/autograph/operators:py_builtins", "//tensorflow/python/checkpoint", "//tensorflow/python/checkpoint:checkpoint_management", + "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/experimental/ops:take_while_ops", "//tensorflow/python/data/experimental/service:_pywrap_snapshot_utils", "//tensorflow/python/data/util:convert", @@ -114,6 +115,7 @@ py_strict_library( "//tensorflow/python/framework:composite_tensor", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", "//tensorflow/python/framework:function", "//tensorflow/python/framework:none_tensor", "//tensorflow/python/framework:ops", @@ -192,7 +194,6 @@ py_strict_library( "//tensorflow/python/saved_model:nested_structure_coder", "//tensorflow/python/trackable:base", "//tensorflow/python/training:saver", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:nest", @@ -268,6 +269,7 @@ py_strict_library( ":dataset_ops", ":structured_function", "//tensorflow/python:tf2", + "//tensorflow/python/compat:v2_compat", "//tensorflow/python/data/util:convert", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 08ea8693d1cbbc..358b316ea0bd3b 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -28,6 +28,7 @@ from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import struct_pb2 from tensorflow.python import tf2 +from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_autograph from tensorflow.python.data.ops import debug_mode from tensorflow.python.data.ops import iterator_ops @@ -4212,6 +4213,17 @@ def with_options(self, options, name=None) -> "DatasetV1Adapter": Dataset = DatasetV1 +def _tf2_callback(): + global Dataset + if tf2.enabled(): + Dataset = DatasetV2 + else: + Dataset = DatasetV1 + + +v2_compat.register_data_v2_callback(_tf2_callback) + + class DatasetV1Adapter(DatasetV1): """Wraps a V2 `Dataset` object in the `tf.compat.v1.data.Dataset` API.""" diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py index 
8c09060ab85976..6db3abca84c880 100644 --- a/tensorflow/python/data/ops/iterator_ops.py +++ b/tensorflow/python/data/ops/iterator_ops.py @@ -39,7 +39,6 @@ from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.trackable import base as trackable from tensorflow.python.training.saver import BaseSaverBuilder -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import deprecation from tensorflow.python.util.compat import collections_abc from tensorflow.python.util.tf_export import tf_export @@ -1013,5 +1012,4 @@ def get_next_as_optional(iterator): return iterator.get_next_as_optional() -_pywrap_utils.RegisterType("OwnedIterator", OwnedIterator) iterator_autograph.register_overrides() diff --git a/tensorflow/python/data/ops/load_op.py b/tensorflow/python/data/ops/load_op.py index bb25e08feb7060..bec48f81349ccd 100644 --- a/tensorflow/python/data/ops/load_op.py +++ b/tensorflow/python/data/ops/load_op.py @@ -15,6 +15,8 @@ """Implementation of LoadDataset in Python.""" import multiprocessing import os +import time +from typing import Optional from google.protobuf import message from google.protobuf import text_format @@ -22,6 +24,9 @@ from tensorflow.python.data.experimental.service import _pywrap_snapshot_utils from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import structured_function +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops from tensorflow.python.platform import gfile # TODO(b/238903802): Use TypeSpec serialization methods directly. @@ -31,22 +36,6 @@ def _load(path, element_spec, compression, reader_func): """Loads dataset from tf.data snapshot.""" - def _get_distributed_snapshot_metadata(): - """Reads the distributed snapshot metadata. - - Returns: - DistributedSnapshotMetadata if the snapshot is a distributed snapshot. - Returns None if it is a non-distributed snapshot. - """ - try: - with gfile.GFile( - _pywrap_snapshot_utils.TF_DATA_SnapshotMetadataFilePath(path), "r" - ) as f: - return text_format.ParseLines( - f, snapshot_pb2.DistributedSnapshotMetadata()) - except (text_format.ParseError, message.DecodeError, UnicodeDecodeError): - return None - if reader_func is None: reader_func = lambda datasets: datasets.interleave( # pylint:disable=g-long-lambda lambda x: x, @@ -59,7 +48,7 @@ def _get_distributed_snapshot_metadata(): encoded_spec = f.read() element_spec = _parse_element_spec(encoded_spec) - distributed_snapshot_metadata = _get_distributed_snapshot_metadata() + distributed_snapshot_metadata = _load_distributed_snapshot_metadata(path) if distributed_snapshot_metadata: _validate_snapshot( path, distributed_snapshot_metadata, element_spec, compression) @@ -68,6 +57,32 @@ def _get_distributed_snapshot_metadata(): return _LoadDataset(path, element_spec, compression, reader_func) +def _load_distributed_snapshot_metadata( + path: str, +) -> Optional[snapshot_pb2.DistributedSnapshotMetadata]: + """Reads the distributed snapshot metadata. + + Args: + path: Base path of the snapshot. + + Returns: + DistributedSnapshotMetadata if the snapshot is a distributed snapshot. + Returns None if it is a non-distributed snapshot. 
+ """ + try: + with gfile.GFile( + _pywrap_snapshot_utils.TF_DATA_SnapshotMetadataFilePath(path), "r" + ) as f: + return text_format.ParseLines( + f, snapshot_pb2.DistributedSnapshotMetadata()) + except ( + errors.NotFoundError, + text_format.ParseError, + message.DecodeError, + UnicodeDecodeError): + return None + + def _load_distributed_snapshot(path, metadata, reader_func): """Loads a distributed snapshot.""" @@ -83,6 +98,46 @@ def _load_distributed_snapshot(path, metadata, reader_func): return reader_func(dataset) +def _load_distributed_snapshot_v2( + path: str, reader_func=None +) -> dataset_ops.Dataset: + """Load a distributed snapshot using the updated loading algorithm. + + The new version allows the load job to read the snapshot while it is being + written. + + TODO(b/297930782): Merge this into `_load` when it's ready. Currently, this is + for testing only. + + Args: + path: Base path of the snapshot. + reader_func: Optional. A function to control how to read data from shards. + If present, the function will be traced and executed as graph computation. + + Returns: + The loaded dataset. + """ + + if not reader_func: + reader_func = lambda datasets: datasets.interleave( # pylint:disable=g-long-lambda + lambda x: x, + cycle_length=multiprocessing.cpu_count(), + num_parallel_calls=dataset_ops.AUTOTUNE) + + metadata = _load_distributed_snapshot_metadata(path) + while not metadata: + time.sleep(2) + metadata = _load_distributed_snapshot_metadata(path) + + dataset = _ListSnapshotChunksDataset(path) + dataset = dataset.map( + lambda chunk_file: _SnapshotChunkDataset( # pylint:disable=g-long-lambda + chunk_file, + element_spec=_parse_element_spec(metadata.element_spec), + compression=metadata.compression)) + return reader_func(dataset) + + class _LoadDataset(dataset_ops.DatasetSource): """A dataset that loads previously saved dataset.""" @@ -127,6 +182,25 @@ def element_spec(self): return self._element_spec +class _ListSnapshotChunksDataset(dataset_ops.DatasetSource): + """A dataset for listing snapshot chunk files. + + It supports listing partially written snapshots. When a snapshot is being + written, it returns the currently available chunk files. + """ + + def __init__(self, snapshot_path: str): + self._snapshot_path = snapshot_path + variant_tensor = ged_ops.list_snapshot_chunks_dataset( + snapshot_path, **self._flat_structure + ) + super().__init__(variant_tensor) + + @property + def element_spec(self) -> tensor_spec.TensorSpec: + return tensor_spec.TensorSpec([], dtypes.string) + + def _validate_snapshot(path, metadata, element_spec, compression): """Validates a tf.data distributed snapshot. 
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index 347b7a5c272973..566abb7b66eceb 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -16,6 +16,7 @@ import os from tensorflow.python import tf2 +from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import from_tensor_slices_op from tensorflow.python.data.ops import structured_function @@ -705,3 +706,18 @@ def _filenames(self, value): FixedLengthRecordDataset = FixedLengthRecordDatasetV1 TFRecordDataset = TFRecordDatasetV1 TextLineDataset = TextLineDatasetV1 + + +def _tf2_callback(): + global FixedLengthRecordDataset, TFRecordDataset, TextLineDataset + if tf2.enabled(): + FixedLengthRecordDataset = FixedLengthRecordDatasetV2 + TFRecordDataset = TFRecordDatasetV2 + TextLineDataset = TextLineDatasetV2 + else: + FixedLengthRecordDataset = FixedLengthRecordDatasetV1 + TFRecordDataset = TFRecordDatasetV1 + TextLineDataset = TextLineDatasetV1 + + +v2_compat.register_data_v2_callback(_tf2_callback) diff --git a/tensorflow/python/debug/lib/debug_events_reader.py b/tensorflow/python/debug/lib/debug_events_reader.py index 706823b799b14a..2b38a4ca4d34ff 100644 --- a/tensorflow/python/debug/lib/debug_events_reader.py +++ b/tensorflow/python/debug/lib/debug_events_reader.py @@ -109,8 +109,15 @@ def _load_metadata_files(self): wall_times.append(debug_event.wall_time) run_ids.append(debug_event.debug_metadata.tfdbg_run_id) tensorflow_versions.append( - debug_event.debug_metadata.tensorflow_version) + debug_event.debug_metadata.tensorflow_version + ) file_versions.append(debug_event.debug_metadata.file_version) + except Exception as e: + raise errors.DataLossError( + None, + None, + "Error reading tfdbg metadata from paths %s" % metadata_paths, + ) from e finally: reader.close() self._starting_wall_time = wall_times[0] diff --git a/tensorflow/python/distribute/integration_test/saved_model_test.py b/tensorflow/python/distribute/integration_test/saved_model_test.py index 0b7677f9d8cf13..aa0215387e2f6b 100644 --- a/tensorflow/python/distribute/integration_test/saved_model_test.py +++ b/tensorflow/python/distribute/integration_test/saved_model_test.py @@ -40,7 +40,6 @@ from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.eager import test -from tensorflow.python.framework import errors_impl from tensorflow.python.ops import lookup_ops _sixteen_worker_pool = strategy_combinations._deferred_pool_runner( @@ -684,16 +683,15 @@ def test_sharded_variable(self): self.assertAllEqual(self.load_and_run_v1(model_dir, {"x": 1}), [6, 6, 6, 6]) - def test_load_with_partitioner_raises_error(self): + def test_load_with_partitioner_works(self): model = self.Model() model_dir = self.get_temp_dir() tf.saved_model.save(model, model_dir) strategy = parameter_server_strategy_v2.ParameterServerStrategyV2( self.cluster_resolver, tf1.fixed_size_partitioner(2)) - with self.assertRaises(errors_impl.InvalidArgumentError): - with strategy.scope(): - tf.saved_model.load(model_dir) + with strategy.scope(): + tf.saved_model.load(model_dir) if __name__ == "__main__": diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py index 69b22392903a03..a07df8e337cb0c 100644 --- a/tensorflow/python/distribute/multi_process_runner.py +++ b/tensorflow/python/distribute/multi_process_runner.py @@ -929,10 +929,13 @@ 
def shutdown(self): if self._runner is not None: try: self._runner.join() + except unittest.SkipTest: + raise except Exception as e: # pylint: disable=broad-except - logging.error( + logging.exception( 'Ignoring exception when shutting down MultiProcessPoolRunner: %s', - e) + e, + ) self._runner = None def _start(self): diff --git a/tensorflow/python/distribute/sharded_variable.py b/tensorflow/python/distribute/sharded_variable.py index 12c9ed9aa3ed10..4f4e0a5cbf3eaa 100644 --- a/tensorflow/python/distribute/sharded_variable.py +++ b/tensorflow/python/distribute/sharded_variable.py @@ -438,7 +438,7 @@ def __getitem__(self, slice_spec): ) for i in range(len(self._variables)): if i == len(self._variables) - 1 or ( - s > self._var_offsets[i][0] and s < self._var_offsets[i + 1][0] + s >= self._var_offsets[i][0] and s < self._var_offsets[i + 1][0] ): return self._variables[i][ (s - self._var_offsets[i][0],) + slice_spec[1:] diff --git a/tensorflow/python/distribute/sharded_variable_test.py b/tensorflow/python/distribute/sharded_variable_test.py index 4c83bc49b328db..797b9d066b45dc 100644 --- a/tensorflow/python/distribute/sharded_variable_test.py +++ b/tensorflow/python/distribute/sharded_variable_test.py @@ -569,14 +569,20 @@ def safe_sparse_lookup(): self.assertAllClose(safe_sparse_lookup(), [[1., 2.], [0., 0.], [3., 4.]]) def test_slicing(self): + data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], + [15, 16]] v = [ - variables_lib.Variable([[1, 2], [3, 4], [5, 6]]), - variables_lib.Variable([[7, 8], [9, 10], [11, 12]]), - variables_lib.Variable([[13, 14], [15, 16]]) + variables_lib.Variable(data[:3]), + variables_lib.Variable(data[3:6]), + variables_lib.Variable(data[6:]) ] sv = sharded_variable.ShardedVariable(v) empty = v[0][0:0] + # Test cases: all individual indices + for ix in range(len(data)): + self.assertAllEqual(sv[ix].numpy(), data[ix]) + # Test cases: positive step self.assertAllEqual(sv[:], array_ops.concat(v, axis=0)) self.assertAllEqual(sv[:2], [[1, 2], [3, 4]]) diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 3c8f7c80a0d593..2923868b1280de 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -9,6 +9,7 @@ load( load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test") load( "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", "tf_py_logged_benchmark", ) @@ -113,6 +114,8 @@ cuda_py_strict_test( deps = [ ":pywrap_tensor_test_util", ":test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:test_lib", "//third_party/py/numpy", ], ) @@ -805,7 +808,7 @@ py_strict_library( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "benchmarks_test", srcs = ["benchmarks_test.py"], python_version = "PY3", @@ -896,7 +899,7 @@ tf_xla_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "remote_benchmarks_test", srcs = ["remote_benchmarks_test.py"], python_version = "PY3", diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index c4dc6d228c9bf3..a81fb37b013616 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -447,7 +447,7 @@ def testTapeNoOpGradient2By2(self): self.assertAllEqual(dy_dy.numpy(), constant_op.constant(1.0, shape=[2, 2]).numpy()) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testTapeNoOpGradientMultiTarget2By2(self): a_2_by_2 = constant_op.constant(2.0, 
shape=[2, 2]) with backprop.GradientTape(persistent=True) as tape: @@ -1648,7 +1648,7 @@ def grad_fn(x): self.assertIn('gradient_tape/my_scope/', op.name) self.assertEqual(num_sin_ops_found, 2) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testRecomputeGradWithDifferentShape(self): if sys.version_info.major == 3 and sys.version_info.minor in (11, 12): # TODO(b/264947738) @@ -1681,7 +1681,7 @@ def outer_dict(x): self.assertAllEqual(y[1], 2.0) @parameterized.parameters([(True), (False)]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testRecomputeGradWithNestedFunctionAndWhileLoop(self, reduce_retracing): if sys.version_info.major == 3 and sys.version_info.minor in (11, 12): # TODO(b/264947738) diff --git a/tensorflow/python/eager/benchmarks/BUILD b/tensorflow/python/eager/benchmarks/BUILD index 58878b11343e05..d50d428ed712f8 100644 --- a/tensorflow/python/eager/benchmarks/BUILD +++ b/tensorflow/python/eager/benchmarks/BUILD @@ -1,4 +1,7 @@ -load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test") +load( + "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -6,7 +9,7 @@ package( licenses = ["notice"], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "kpi_benchmark_test", size = "medium", srcs = ["kpi_benchmark_test.py"], diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index aeda5a61594fd7..18f32cc1604186 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -305,7 +305,6 @@ class LogicalDevice( placement. device_type: String declaring the type of device such as "CPU" or "GPU". """ - pass @tf_export("config.LogicalDeviceConfiguration", @@ -688,6 +687,10 @@ def set_server_def(self, server_def, keep_alive_secs=_KEEP_ALIVE_SECS): # Clear all the caches in case there are remote tensors in them. self._clear_caches() + # Also clear the device parsing cache since it caches the resolution of + # partial device names, which may become different due to the set_server_def + # call as we may have defined different devices. + _device_parsing_cache.clear() def update_server_def(self, server_def, keep_alive_secs=_KEEP_ALIVE_SECS): """Update a server_def on the context. @@ -1378,9 +1381,13 @@ def add_function_def(self, fdef): fdef: A FunctionDef protocol buffer message. """ self.ensure_initialized() - fdef_string = fdef.SerializeToString() - pywrap_tfe.TFE_ContextAddFunctionDef(self._handle, fdef_string, - len(fdef_string)) + if is_oss: + fdef_string = fdef.SerializeToString() + pywrap_tfe.TFE_ContextAddFunctionDef( + self._handle, fdef_string, len(fdef_string) + ) + else: + pywrap_tfe.TFE_ContextAddFunctionDefNoSerialization(self._handle, fdef) def get_function_def(self, name): """Get a function definition from the context. 
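A recurring change in the test diffs above and below is purely syntactic: @test_util.assert_no_new_pyobjects_executing_eagerly becomes @test_util.assert_no_new_pyobjects_executing_eagerly(). The parentheses are significant if the helper is used as a decorator factory, which the patch suggests: calling it returns the actual decorator, whereas applying the bare factory with @ would pass the test method in as the factory's first argument instead of wrapping it. A rough, self-contained illustration of the pattern with a hypothetical leak-checking factory (none of the names below are TensorFlow APIs):

import functools
import gc


def assert_no_new_objects(warmup_iters=2):
  """Hypothetical decorator factory; not TensorFlow's implementation."""
  def decorator(test_fn):
    @functools.wraps(test_fn)
    def wrapper(*args, **kwargs):
      for _ in range(warmup_iters):  # warm up caches so they are not counted
        test_fn(*args, **kwargs)
      gc.collect()
      before = len(gc.get_objects())
      test_fn(*args, **kwargs)
      gc.collect()
      print("new objects:", len(gc.get_objects()) - before)
    return wrapper
  return decorator


# The call returns the decorator; writing @assert_no_new_objects without the
# parentheses would instead pass sample_test in as warmup_iters.
@assert_no_new_objects()
def sample_test():
  _ = [i * i for i in range(1000)]


sample_test()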
diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 70f6e0e90877b5..82cac2d18a53c8 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -336,7 +336,7 @@ def testJVPFunctionUsedByAccumulatorForOps(self): finally: pywrap_tfe.TFE_Py_RegisterJVPFunction(previous_fn) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFunctionCacheLimited(self): # Every time this loop is executed, it will create a slightly larger Tensor # and push it through Add's gradient. @@ -357,7 +357,7 @@ def testVariableUnwatchedZero(self): self.assertIsNone(acc.jvp(v)) self.assertAllClose([[0.]], acc.jvp(v, unconnected_gradients="zero")) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFunctionReturnsResource(self): v = variables.Variable([[1.]]) x = constant_op.constant(1.) @@ -371,7 +371,7 @@ def f(a): y, _ = f(x) self.assertAllClose(2., acc.jvp(y)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testMultipleWatchesAdd(self): x = constant_op.constant(-2.) with self.assertRaisesRegex(ValueError, "multiple times"): @@ -387,7 +387,7 @@ def testMultipleWatchesAdd(self): self.assertAllClose(24., acc.jvp(x)) self.assertAllClose(24. * 3., acc.jvp(y)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testReenter(self): x = constant_op.constant(-2.) with forwardprop.ForwardAccumulator(x, 1.5) as acc: @@ -403,7 +403,7 @@ def testReenter(self): yy = y * y self.assertAllClose(6. * -8. * 2., acc.jvp(yy)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDeadTensorsJVPCleared(self): x = array_ops.ones([100]) x_weak = weakref.ref(x) @@ -424,14 +424,14 @@ def testDeadTensorsJVPCleared(self): self.assertIsNone(derived_tensor_weak()) self.assertIsNone(derived_tensor_grad_weak()) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testJVPManual(self): primal, tangent = _jvp(math_ops.sin, (constant_op.constant(0.1),), (constant_op.constant(0.2),)) self.assertAllClose(math_ops.sin(0.1), primal) self.assertAllClose(math_ops.cos(0.1) * 0.2, tangent) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNumericHigherOrder(self): def f(x): @@ -448,7 +448,7 @@ def f(x): satol=1e-3, ) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNumericHigherOrderFloat64(self): def f(x): @@ -462,7 +462,7 @@ def f(x): [constant_op.constant([[2.0, 3.0], [1.0, 4.0]], dtype=dtypes.float64)], order=3) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testCustomGradient(self): @custom_gradient.custom_gradient @@ -475,7 +475,7 @@ def grad(dy): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) - # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly + # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly() # fails around this test? 
def testExceptionCustomGradientRecomputeGradForward(self): @@ -563,7 +563,7 @@ def grad(dy): ("Order{}".format(order), order, expected) for order, expected in enumerate(_X11_35_DERIVATIVES) ]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testHigherOrderPureForward(self, order, expected): def _forwardgrad(f): @@ -606,7 +606,7 @@ def f(x): self.assertAllClose(3.5 * 2.5 * 1.1**1.5, outer_jvp) self.assertIsNone(acc.jvp(outer_acc.jvp(primal_out))) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testJVPPacking(self): two = constant_op.constant(2.) primal_in = constant_op.constant(1.) @@ -688,7 +688,7 @@ def _expected(mat, tangent): self.assertAllClose(_expected(m1, tangent1), acc.jvp(result1)) self.assertAllClose(_expected(m2, tangent2), acc.jvp(result2)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testHVPMemory(self): def fun(x): @@ -698,7 +698,7 @@ def fun(x): tangents = constant_op.constant([3., 4., 5.]) _hvp(fun, (primals,), (tangents,)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testHVPCorrectness(self): def fun(x): @@ -725,7 +725,7 @@ def fun(x): self.assertAllClose(backback_hvp, forwardback_hvp_eager) self.assertAllClose(backback_hvp, forwardback_hvp_function) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testShouldRecordAndStopRecord(self): c = constant_op.constant(1.) c_tangent = constant_op.constant(2.) @@ -747,7 +747,7 @@ def testShouldRecordAndStopRecord(self): self.assertIsNone(acc.jvp(d)) self.assertIsNone(tape.gradient(d, c)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testRecordingSelectively(self): c = constant_op.constant(1.) c_tangent = constant_op.constant(2.) @@ -774,7 +774,7 @@ def testRecordingSelectively(self): self.assertIsNone(tape.gradient(d, c)) self.assertAllClose(3., tape.gradient(e, c)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testOpWithNoTrainableOutputs(self): if sys.version_info.major == 3 and sys.version_info.minor in (11, 12): # TODO(b/264947738) @@ -847,7 +847,7 @@ def testBackwardOverForward(self, forward_prop_first): self.assertTrue(record.should_record_backprop((acc.jvp(d),))) self.assertAllClose(-.1 * math_ops.cos(1.), tape.gradient(acc.jvp(d), c)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testRecordingWithJVPIndices(self): c = constant_op.constant(1.) with forwardprop.ForwardAccumulator(c, 10.) as acc: @@ -861,7 +861,7 @@ def testRecordingWithJVPIndices(self): None, (((0, 1),),)) self.assertAllClose(3., acc.jvp(d)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testSpecialForwardFunctionUsed(self): c = constant_op.constant(1.) d = constant_op.constant(2.) 
@@ -875,7 +875,7 @@ def testSpecialForwardFunctionUsed(self): lambda x: [x]) self.assertAllClose(-20., acc.jvp(e)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testVariableWatched(self): if sys.version_info.major == 3 and sys.version_info.minor in (11, 12): # TODO(b/264947738) @@ -1015,25 +1015,25 @@ def _fprop_cond(k, y): class ControlFlowTests(test.TestCase): - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testOfFunctionWhile(self): y = constant_op.constant(1.) with forwardprop.ForwardAccumulator(y, 1.) as acc: self.assertAllClose(10., acc.jvp(_has_loop(constant_op.constant(5), y))) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testOfFunctionCond(self): y = constant_op.constant(1.) with forwardprop.ForwardAccumulator(y, 1.) as acc: self.assertAllClose(3., acc.jvp(_has_cond(constant_op.constant(5), y))) self.assertAllClose(0., acc.jvp(_has_cond(constant_op.constant(0), y))) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testInFunctionWhile(self): self.assertAllClose( 10., _fprop_while(constant_op.constant(5), constant_op.constant(1.))) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testInFunctionCond(self): self.assertAllClose( 3., _fprop_cond(constant_op.constant(5), constant_op.constant(1.))) diff --git a/tensorflow/python/eager/memory_tests/memory_test.py b/tensorflow/python/eager/memory_tests/memory_test.py index ee5104ef27b343..3503058b0012cd 100644 --- a/tensorflow/python/eager/memory_tests/memory_test.py +++ b/tensorflow/python/eager/memory_tests/memory_test.py @@ -61,7 +61,7 @@ def graph(x): memory_test_util.assert_no_leak( f, num_iters=1000, increase_threshold_absolute_mb=30) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNestedFunctionsDeleted(self): @def_function.function diff --git a/tensorflow/python/eager/polymorphic_function/BUILD b/tensorflow/python/eager/polymorphic_function/BUILD index e98fe42f55917e..2160b5a8f91c2d 100644 --- a/tensorflow/python/eager/polymorphic_function/BUILD +++ b/tensorflow/python/eager/polymorphic_function/BUILD @@ -118,7 +118,6 @@ py_strict_library( "//tensorflow/python/profiler:trace", "//tensorflow/python/trackable:base", "//tensorflow/python/types:core", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:compat", "//tensorflow/python/util:nest", "//tensorflow/python/util:object_identity", diff --git a/tensorflow/python/eager/polymorphic_function/concrete_function.py b/tensorflow/python/eager/polymorphic_function/concrete_function.py index 3bbc4deeca4aaf..a68acdd94d40e0 100644 --- a/tensorflow/python/eager/polymorphic_function/concrete_function.py +++ b/tensorflow/python/eager/polymorphic_function/concrete_function.py @@ -47,7 +47,6 @@ from tensorflow.python.profiler import trace from tensorflow.python.trackable import base as trackable from tensorflow.python.types import core -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import compat from tensorflow.python.util import nest from tensorflow.python.util import object_identity @@ -1735,11 +1734,6 @@ def _export_to_saved_model_graph(self, object_map, tensor_map, return [] -_pywrap_utils.RegisterType("Tensor", tensor_lib.Tensor) 
-_pywrap_utils.RegisterType("EagerTensor", ops.EagerTensor) -_pywrap_utils.RegisterType("IndexedSlices", indexed_slices.IndexedSlices) - - class ConcreteFunctionGarbageCollector: """Cleans up reference cycles when a `ConcreteFunction` goes out of scope.""" diff --git a/tensorflow/python/eager/polymorphic_function/polymorphic_function_test.py b/tensorflow/python/eager/polymorphic_function/polymorphic_function_test.py index 64aab16798ebf4..663562a347b59f 100644 --- a/tensorflow/python/eager/polymorphic_function/polymorphic_function_test.py +++ b/tensorflow/python/eager/polymorphic_function/polymorphic_function_test.py @@ -3833,7 +3833,7 @@ def testMethodReferenceCycles(self): # function itself is not involved in a reference cycle. self.assertIs(None, weak_fn()) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testErrorMessageWhenGraphTensorIsPassedToEager(self): @polymorphic_function.function diff --git a/tensorflow/python/eager/polymorphic_function/tracing_compilation_test.py b/tensorflow/python/eager/polymorphic_function/tracing_compilation_test.py index 96ea55beeb8077..42d8091ed960d5 100644 --- a/tensorflow/python/eager/polymorphic_function/tracing_compilation_test.py +++ b/tensorflow/python/eager/polymorphic_function/tracing_compilation_test.py @@ -385,7 +385,7 @@ def sum_gather(): expected = self.evaluate(sum_gather()) self.assertAllEqual(expected, self.evaluate(defined())) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testCallOptionsMemory(self): @compiled_fn def model(x): diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index dc6db8b6e78962..306e275c63905c 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -1095,6 +1095,15 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) { PyErr_SetString(PyExc_RuntimeError, "Error while creating EagerTensorType"); return nullptr; } +#if PY_VERSION_HEX >= 0x030B0000 + // Py_TPFLAGS_MANAGED_DICT is turned on by PyType_FromSpecWithBases by + // default. It tells Python that the class's __dict__ should be managed by VM, + // but EagerTensor sets a `tp_dictoffset` (below) to explicitly manage the + // dict. 
See: + // - https://docs.python.org/3/c-api/typeobj.html#c.Py_TPFLAGS_MANAGED_DICT + // - https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_dictoffset + EagerTensorType->tp_flags &= ~Py_TPFLAGS_MANAGED_DICT; +#endif EagerTensorType->tp_dictoffset = offsetof(EagerTensor, dict); EagerTensorType->tp_as_buffer = &EagerTensor_as_buffer; #else diff --git a/tensorflow/python/eager/pywrap_tensor_test.py b/tensorflow/python/eager/pywrap_tensor_test.py index a684a80658fa10..c1539b24f802c3 100644 --- a/tensorflow/python/eager/pywrap_tensor_test.py +++ b/tensorflow/python/eager/pywrap_tensor_test.py @@ -17,6 +17,18 @@ import numpy as np from tensorflow.python.eager import pywrap_tensor_test_util as util from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import test_util + + +class MyPythonObject: + pass + + +def my_layer(x): + y = x**2 + y.my_dynamic_attribute = MyPythonObject() + return y class PywrapTensorTest(test.TestCase): @@ -26,6 +38,14 @@ def testGetScalarOne(self): self.assertIsInstance(result, np.ndarray) self.assertAllEqual(result, 1.0) + @test_util.assert_no_new_pyobjects_executing_eagerly() + def test_no_leak(self): + x = constant_op.constant([1, 2, 3]) + layer = my_layer(x) + for _ in range(int(1e2)): + layer = my_layer(x) + self.assertIsNotNone(layer) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py index ea5e6006b9fa24..532d7f1555521f 100644 --- a/tensorflow/python/eager/tensor_test.py +++ b/tensorflow/python/eager/tensor_test.py @@ -86,7 +86,7 @@ def testNumpyValue(self): t = _create_tensor(values) self.assertAllEqual(values, t) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNumpyDtypeSurvivesThroughTensorConversion(self): scalar_creators = [np.int32, np.int64, np.float32, np.float64] conversion_functions = [ops.convert_to_tensor, constant_op.constant] @@ -359,7 +359,7 @@ def testConvertToTensorAllowsOverflow(self): _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8) @test_util.run_in_graph_and_eager_modes - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testConvertToTensorNumpyZeroDim(self): for np_type, dtype in [(np.int32, dtypes.int32), (np.half, dtypes.half), (np.float32, dtypes.float32)]: @@ -370,7 +370,7 @@ def testConvertToTensorNumpyZeroDim(self): self.assertAllEqual(x, [65, 16]) @test_util.run_in_graph_and_eager_modes - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testConvertToTensorNumpyScalar(self): x = ops.convert_to_tensor([ np.array(321, dtype=np.int64).item(), @@ -422,19 +422,19 @@ def testMemoryviewIsReadonly(self): t = constant_op.constant([0.0]) self.assertTrue(memoryview(t).readonly) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testMemoryviewScalar(self): t = constant_op.constant(42.0) self.assertAllEqual( np.array(memoryview(t)), np.array(42.0, dtype=np.float32)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testMemoryviewEmpty(self): t = constant_op.constant([], dtype=np.float32) self.assertAllEqual(np.array(memoryview(t)), np.array([])) @test_util.run_gpu_only - @test_util.assert_no_new_pyobjects_executing_eagerly + 
@test_util.assert_no_new_pyobjects_executing_eagerly() def testMemoryviewCopyToCPU(self): with ops.device("/device:GPU:0"): t = constant_op.constant([0.0]) @@ -620,7 +620,7 @@ def testSliceDimOutOfRange(self): "but tensor at index 2 has rank 0"): pywrap_tfe.TFE_Py_TensorShapeSlice([t2, t1, t3], 0) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testTensorDir(self): t = array_ops.ones(1) t.test_attr = "Test" @@ -639,7 +639,7 @@ def testNonRectangularPackAsConstant(self): with self.assertRaisesRegex(ValueError, "non-rectangular Python sequence"): constant_op.constant(l) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFloatAndIntAreConvertibleToComplex(self): a = [[1., 1], [1j, 2j]] np_value = np.array(a, dtype=np.complex128) diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py index 65228aeb2bbe19..5a641aba2da70f 100644 --- a/tensorflow/python/eager/wrap_function.py +++ b/tensorflow/python/eager/wrap_function.py @@ -224,7 +224,7 @@ def __init__(self, fn_graph, variable_holder, attrs=None, signature=None): _lift_unlifted_variables(fn_graph, variable_holder) # We call __init__ after lifting variables so that the function's signature # properly reflects the new captured inputs. - for f in fn_graph.as_graph_def().library.function: + for f in fn_graph.as_graph_def(use_pybind11_proto=True).library.function: context.context().add_function_def(f) self._signature = signature function_type = function_type_lib.from_structured_signature( diff --git a/tensorflow/python/flags_pybind.pyi b/tensorflow/python/flags_pybind.pyi index ad94fa3e713c7a..fbf7124eac2f0a 100644 --- a/tensorflow/python/flags_pybind.pyi +++ b/tensorflow/python/flags_pybind.pyi @@ -19,6 +19,7 @@ class Flag: def value(self) -> bool: ... class Flags: + enable_aggressive_constant_replication: Flag enable_nested_function_shape_inference: Flag enable_quantized_dtypes_training: Flag graph_building_optimization: Flag diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD index 43dd8056fd6cfd..a697162b4b91cf 100644 --- a/tensorflow/python/framework/BUILD +++ b/tensorflow/python/framework/BUILD @@ -19,8 +19,12 @@ load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict load("//tensorflow/core/platform:build_config.bzl", "pyx_library", "tf_additional_all_protos", "tf_additional_lib_deps", "tf_proto_library", "tf_protos_grappler") # @unused load("//tensorflow/core/platform:build_config_root.bzl", "if_static", "tf_additional_xla_deps_py") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test") +load( + "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", +) -visibility = tf_python_framework_friends() +visibility = tf_python_framework_friends() # buildifier: disable=package-on-top package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -233,10 +237,11 @@ tf_cc_test( ], ) +# Do not depend on this rule! Depend on the fine-grained sub-targets instead. py_strict_library( name = "for_generated_wrappers", - deprecation = "Depending on this target can cause build dependency cycles. Depend on the fine-grained sub-targets instead.", srcs_version = "PY3", + tags = ["avoid_dep"], visibility = ["//visibility:public"], deps = [ ":byte_swap_tensor", @@ -254,13 +259,14 @@ py_strict_library( ], ) -# What is needed for tf_gen_op_wrapper_py. 
This is the same as -# "for_generated_wrappers" minus the "function" dep. This is to avoid -# circular dependencies, as "function" uses generated op wrappers. +# This rule should only be depended on by tf_gen_op_wrapper_py. +# Do not depend on this rule! Depend on the fine-grained sub-targets instead. +# This is the same as "for_generated_wrappers" minus the "function" dep. +# This is to avoid circular dependencies, as "function" uses generated op wrappers. py_strict_library( name = "for_generated_wrappers_v2", - deprecation = "Depending on this target can cause build dependency cycles. Depend on the fine-grained sub-targets instead.", srcs_version = "PY3", + tags = ["avoid_dep"], visibility = ["//visibility:public"], deps = [ ":byte_swap_tensor", @@ -296,81 +302,6 @@ py_strict_library( ], ) -py_strict_library( - name = "framework", - deprecation = "This target has been split. Depend on the sub-targets instead.", - srcs_version = "PY3", - visibility = visibility + ["//tensorflow:internal"], - deps = [ - ":_errors_test_helper", - ":_pywrap_python_api_dispatcher", - ":_pywrap_python_api_info", - ":_pywrap_python_api_parameter_converter", - ":_pywrap_python_op_gen", - ":byte_swap_tensor", - ":c_api_util", - ":composite_tensor", - ":config", - ":cpp_shape_inference_proto_py", - ":device", - ":dtypes", - ":error_interpolation", - ":errors", - ":fast_tensor_util", - ":for_generated_wrappers", - ":framework_lib", - ":function", - ":graph_io", - ":graph_util", - ":importer", - ":indexed_slices", - ":load_library", - ":meta_graph", - ":op_def_registry", - ":ops", - ":random_seed", - ":sparse_tensor", - ":tensor", - ":tensor_conversion_registry", - ":tensor_shape", - ":tensor_spec", - ":tensor_util", - ":type_spec", - ":versions", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:_pywrap_py_exception_registry", - "//tensorflow/python:_pywrap_quantize_training", - "//tensorflow/python:pywrap_mlir", - "//tensorflow/python:pywrap_tensorflow", - "//tensorflow/python:pywrap_tfe", - "//tensorflow/python:tf2", - "//tensorflow/python/client:_pywrap_debug_events_writer", - "//tensorflow/python/client:_pywrap_events_writer", - "//tensorflow/python/client:pywrap_tf_session", - "//tensorflow/python/eager:context", - "//tensorflow/python/lib/core:_pywrap_py_func", # TODO(b/142001480): remove once the bug is fixed. 
- "//tensorflow/python/lib/io:file_io", - "//tensorflow/python/ops:control_flow_util", - "//tensorflow/python/platform:_pywrap_stacktrace_handler", - "//tensorflow/python/platform:tf_logging", - "//tensorflow/python/util:_pywrap_checkpoint_reader", - "//tensorflow/python/util:_pywrap_kernel_registry", - "//tensorflow/python/util:_pywrap_nest", - "//tensorflow/python/util:_pywrap_stat_summarizer", - "//tensorflow/python/util:_pywrap_tfprof", - "//tensorflow/python/util:_pywrap_transform_graph", - "//tensorflow/python/util:_pywrap_util_port", - "//tensorflow/python/util:_pywrap_utils", - "//tensorflow/python/util:compat", - "//tensorflow/python/util:deprecation", - "//tensorflow/python/util:tf_export", - "//third_party/py/numpy", - "@pypi_packaging//:pkg", - ] + if_xla_available([ - "//tensorflow/python:_pywrap_tfcompile", - ]), -) - py_strict_library( name = "byte_swap_tensor", srcs = ["byte_swap_tensor.py"], @@ -405,7 +336,10 @@ py_strict_library( py_strict_library( name = "constant_op", - srcs = ["constant_op.py"], + srcs = [ + "constant_op.py", + "constant_tensor_conversion.py", + ], srcs_version = "PY3", visibility = visibility + [ "//smartass:__subpackages__", @@ -776,6 +710,23 @@ py_strict_library( ], ) +py_strict_library( + name = "override_binary_operator", + srcs = ["override_binary_operator.py"], + srcs_version = "PY3", + deps = [ + ":dtypes", + ":ops", + ":tensor", + ":tensor_shape", + "//tensorflow/python/ops:math_ops_gen", + "//tensorflow/python/ops/numpy_ops:np_dtypes", + "//tensorflow/python/util:nest", + "//tensorflow/python/util:traceback_utils", + "//third_party/py/numpy", + ], +) + cc_library( name = "py_context_manager", srcs = ["py_context_manager.cc"], @@ -1652,6 +1603,7 @@ py_strict_library( ":constant_op", ":dtypes", ":ops", + ":override_binary_operator", ":tensor", ":tensor_shape", ":tensor_spec", @@ -1662,10 +1614,10 @@ py_strict_library( "//tensorflow/python:pywrap_tensorflow", "//tensorflow/python:tf2", "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:math_ops_gen", "//tensorflow/python/ops:sparse_ops_gen", "//tensorflow/python/saved_model:nested_structure_coder", "//tensorflow/python/types:internal", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:tf_export", "//third_party/py/numpy", ], @@ -1723,7 +1675,6 @@ py_strict_library( visibility = visibility, deps = [ "//tensorflow/python:pywrap_tensorflow", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:nest", "//tensorflow/python/util:tf_export", ], @@ -1869,7 +1820,6 @@ pytype_strict_library( "//tensorflow/python/types:core", "//tensorflow/python/types:internal", "//tensorflow/python/types:trace", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:nest", @@ -2000,7 +1950,9 @@ pytype_strict_library( "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:tf_export", - ], + ] + if_xla_available([ + "//tensorflow/python:_pywrap_tfcompile", + ]), ) pytype_strict_library( @@ -2013,10 +1965,12 @@ pytype_strict_library( "//tensorflow/python/eager:context", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:tf_export", - ], + ] + if_xla_available([ + "//tensorflow/python:_pywrap_tfcompile", + ]), ) -py_strict_library( +pytype_strict_library( name = "stack", srcs = ["stack.py"], visibility = visibility + ["//tensorflow:internal"], @@ -2057,7 +2011,6 @@ py_strict_library( 
"//tensorflow/python/saved_model:nested_structure_coder", "//tensorflow/python/types:core", "//tensorflow/python/types:internal", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:object_identity", @@ -2103,7 +2056,7 @@ py_strict_library( ], ) -py_strict_library( +pytype_strict_library( name = "traceable_stack", srcs = ["traceable_stack.py"], srcs_version = "PY3", @@ -2129,7 +2082,7 @@ py_strict_library( deps = [], ) -py_strict_library( +pytype_strict_library( name = "test_lib", srcs = ["test_util.py"], srcs_version = "PY3", @@ -3254,7 +3207,7 @@ tf_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "graph_building_benchmark", size = "medium", srcs = ["graph_building_benchmark.py"], diff --git a/tensorflow/python/framework/composite_tensor.py b/tensorflow/python/framework/composite_tensor.py index 05b4f672793f3e..6e1651ab5e7b88 100644 --- a/tensorflow/python/framework/composite_tensor.py +++ b/tensorflow/python/framework/composite_tensor.py @@ -17,7 +17,6 @@ import abc from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -99,9 +98,6 @@ def _convert_variables_to_tensors(self): return self -_pywrap_utils.RegisterType("CompositeTensor", CompositeTensor) - - def replace_composites_with_components(structure): """Recursively replaces CompositeTensors with their components. diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py index 9c2d8d21a7c0b8..1371d6495e2e64 100644 --- a/tensorflow/python/framework/constant_op.py +++ b/tensorflow/python/framework/constant_op.py @@ -25,6 +25,11 @@ from tensorflow.core.protobuf import struct_pb2 from tensorflow.python.eager import context from tensorflow.python.eager import execute +# Import constant_tensor_conversion.py to register tensor conversion functions +# for builtins. These functions were previously in this file, but were +# refactored out so they can be registered at TF import time without importing +# all of constant_op.py. +from tensorflow.python.framework import constant_tensor_conversion # pylint: disable=unused-import from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor as tensor_lib @@ -329,24 +334,6 @@ def is_constant(tensor_or_op): return op.type == "Const" -def _constant_tensor_conversion_function(v, dtype=None, name=None, - as_ref=False): - _ = as_ref - return constant(v, dtype=dtype, name=name) - -# Register the conversion function for the "unconvertible" types -# as a conversion to a constant. 
-tensor_conversion_registry.register_tensor_conversion_function_internal( - tensor_conversion_registry._CONSTANT_OP_CONVERTIBLES, # pylint: disable=protected-access - _constant_tensor_conversion_function, - 0) - -tensor_conversion_registry.register_tensor_conversion_function( - (list, tuple), _constant_tensor_conversion_function, 100) -tensor_conversion_registry.register_tensor_conversion_function( - object, _constant_tensor_conversion_function, 200) - - def _tensor_shape_tensor_conversion_function(s, dtype=None, name=None, diff --git a/tensorflow/python/framework/constant_tensor_conversion.py b/tensorflow/python/framework/constant_tensor_conversion.py new file mode 100644 index 00000000000000..c02cd37c2ac9da --- /dev/null +++ b/tensorflow/python/framework/constant_tensor_conversion.py @@ -0,0 +1,45 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tensor conversion factory functions for builtins to constant Tensors.""" + +from tensorflow.python.framework import tensor_conversion_registry + + +# Factory function for tensor conversion for builtins. Import constant_op.py +# in-line so that it is only imported when it is needed. This file is imported +# at TF import time, thus that helps reduce import slowness. +def _constant_tensor_conversion_function( + v, dtype=None, name=None, as_ref=False +): + from tensorflow.python.framework import constant_op # pylint: disable=g-import-not-at-top + + _ = as_ref + return constant_op.constant(v, dtype=dtype, name=name) + + +# Register the conversion function for the "unconvertible" types +# as a conversion to a constant. 
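Note: the registrations that follow mirror the ones removed from constant_op.py above, so user-visible conversion behavior is unchanged; only the module doing the registering moves. An illustrative sketch, not part of the patch, assuming TensorFlow has been imported so these registrations have run:

import tensorflow as tf

# Lists and tuples (priority 100) and plain Python objects (priority 200)
# still convert to constant tensors via the function registered below.
t = tf.convert_to_tensor([[1, 2], [3, 4]])
print(t.dtype)  # int32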
+tensor_conversion_registry.register_tensor_conversion_function_internal( + tensor_conversion_registry._CONSTANT_OP_CONVERTIBLES, # pylint: disable=protected-access + _constant_tensor_conversion_function, + 0, +) + +tensor_conversion_registry.register_tensor_conversion_function( + (list, tuple), _constant_tensor_conversion_function, 100 +) +tensor_conversion_registry.register_tensor_conversion_function( + object, _constant_tensor_conversion_function, 200 +) diff --git a/tensorflow/python/framework/experimental/BUILD b/tensorflow/python/framework/experimental/BUILD index 2c146b1f865e6e..2d7a8f11129a7f 100644 --- a/tensorflow/python/framework/experimental/BUILD +++ b/tensorflow/python/framework/experimental/BUILD @@ -2,6 +2,10 @@ load("//tensorflow:strict.default.bzl", "py_strict_library") load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_python_pybind_extension") +load( + "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -178,7 +182,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "graph_building_test", size = "small", srcs = ["graph_building_test.py"], diff --git a/tensorflow/python/framework/override_binary_operator.py b/tensorflow/python/framework/override_binary_operator.py new file mode 100644 index 00000000000000..6e5081b8811b70 --- /dev/null +++ b/tensorflow/python/framework/override_binary_operator.py @@ -0,0 +1,169 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Binary operator override class for Tensor overrides.""" +import numbers +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor as tensor_lib +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops.numpy_ops import np_dtypes +from tensorflow.python.util import nest +from tensorflow.python.util import traceback_utils + + +def _maybe_get_dtype(x): + """Returns a numpy type if available from x. Skips if x is numpy.ndarray.""" + # Don't put np.ndarray in this list, because np.result_type looks at the + # value (not just dtype) of np.ndarray to decide the result type. + if isinstance(x, numbers.Real): + return x + if isinstance(x, tensor_lib.Tensor): + return x.dtype.as_numpy_dtype + if isinstance(x, dtypes.DType): + return x.as_numpy_dtype + if isinstance(x, tensor_shape.TensorShape): + return np.int32 + if isinstance(x, (list, tuple)): + raise ValueError(f"Cannot determine dtype. Got sequence {x}.") + return x + + +def maybe_promote_tensors(*tensors, force_same_dtype=False): + """Promotes tensors if numpy style promotion is enabled. 
+ + This function promotes `tensors` according to numpy promotion rules + if numpy style promotion is enabled. Otherwise, if + `force_same_dtype` is `True`, it force-casts `tensors[1:]` to + `tensor[0]`'s dtype. Note that this force-cast can be problematic. + For example, when some `tensors[1:]` elements can be silently + downcasted. + + Args: + *tensors: the list of tensors to promote. + force_same_dtype: bool (optional, default to `False`). When numpy + style promotion is disabled and `force_same_dtype` is `True`, + this function will force-casts `tensors[1:]` to `tensor[0]`'s + dtype (which could be problematic). + + Returns: + The promoted list of tensors. + """ + if ops.is_auto_dtype_conversion_enabled(): + return tensors + if not tensors: + return tensors + if not ops.is_numpy_style_type_promotion(): + if not force_same_dtype: + return tensors + promoted_tensors = [] + promoted_tensors.append(tensors[0]) + dtype = tensors[0].dtype.base_dtype + for tensor in tensors[1:]: + promoted_tensors.append( + ops.convert_to_tensor(tensor, dtype, name="x")) + return promoted_tensors + result_type = np_dtypes._result_type( # pylint: disable=protected-access + *[_maybe_get_dtype(x) for x in nest.flatten(tensors)]) + def _promote_or_cast(x): + if isinstance(x, tensor_lib.Tensor): + x = gen_math_ops.cast(x, result_type) + else: + x = ops.convert_to_tensor(x, result_type) + return x + return [_promote_or_cast(x) for x in tensors] + + +# pylint: disable=protected-access +def override_binary_operator_helper( + func, op_name, clazz_object=tensor_lib.Tensor): + """Register operators with different tensor and scalar versions. + + If `clazz_object` is `SparseTensor`, assumes `func` takes `(sp_indices, + sp_values, sp_shape, dense)` and outputs `(new_sp_values)`. + + Args: + func: the operator + op_name: name of the operator being overridden + clazz_object: class to override for. Either `Tensor` or `SparseTensor`. + """ + + @traceback_utils.filter_traceback + def binary_op_wrapper(x, y): + with ops.name_scope(None, op_name, [x, y]) as name: + try: + # force_same_dtype=False to preserve existing TF behavior + # TODO(b/178860388): Figure out why binary_op_wrapper and + # r_binary_op_wrapper use different force_same_dtype values. + x, y = maybe_promote_tensors(x, y) + return func(x, y, name=name) + except (TypeError, ValueError) as e: + # Even if dispatching the op failed, the RHS may be a tensor aware + # object that can implement the operator with knowledge of itself + # and the tensor. + # If the RHS is not tensor aware we still want to raise the + # original error from the LHS, because it may be more + # informative. 
+ if hasattr(type(y), "__r%s__" % op_name): + try: + r_op = getattr(y, "__r%s__" % op_name) + out = r_op(x) + if out is NotImplemented: + raise + return out + except (TypeError, ValueError): + raise e + else: + raise + + @traceback_utils.filter_traceback + def binary_op_wrapper_sparse(sp_x, y): + with ops.name_scope(None, op_name, [sp_x, y]) as name: + y = ops.convert_to_tensor(y, dtype=sp_x.dtype.base_dtype, name="y") + # use the passed-in SparseTensor class to avoid having to import + # SparseTensor, which would cause a cyclic dep with math_ops + return clazz_object( + sp_x.indices, + func(sp_x.indices, sp_x.values, sp_x.dense_shape, y, name=name), + sp_x.dense_shape) + + @traceback_utils.filter_traceback + def r_binary_op_wrapper(y, x): + with ops.name_scope(None, op_name, [x, y]) as name: + # TODO(b/178860388): Figure out why binary_op_wrapper and + # r_binary_op_wrapper use different force_same_dtype values. + y, x = maybe_promote_tensors(y, x, force_same_dtype=True) + return func(x, y, name=name) + + # Propagate func.__doc__ to the wrappers + try: + doc = func.__doc__ + except AttributeError: + doc = None + binary_op_wrapper.__doc__ = doc + r_binary_op_wrapper.__doc__ = doc + binary_op_wrapper_sparse.__doc__ = doc + + if clazz_object is tensor_lib.Tensor: + clazz_object._override_operator("__%s__" % op_name, binary_op_wrapper) + del binary_op_wrapper + clazz_object._override_operator("__r%s__" % op_name, r_binary_op_wrapper) + del r_binary_op_wrapper + else: + clazz_object._override_operator("__%s__" % op_name, + binary_op_wrapper_sparse) + del binary_op_wrapper_sparse diff --git a/tensorflow/python/framework/python_api_dispatcher.cc b/tensorflow/python/framework/python_api_dispatcher.cc index ae50c87dc334be..805007dc4982bf 100644 --- a/tensorflow/python/framework/python_api_dispatcher.cc +++ b/tensorflow/python/framework/python_api_dispatcher.cc @@ -29,12 +29,28 @@ namespace py_dispatch { namespace { +PyObject* ImportTypeFromModule(const char* module_name, const char* type_name) { + static PyObject* given_type = [module_name, type_name]() { + PyObject* module = PyImport_ImportModule(module_name); + PyObject* attr = + module ? 
PyObject_GetAttrString(module, type_name) : nullptr; + if (attr == nullptr) { + PyErr_WriteUnraisable(nullptr); + PyErr_Clear(); + } + if (module) Py_DECREF(module); + return attr; + }(); + return given_type; +} + std::vector& GetRegisteredDispatchableTypes() { static std::vector* registered_dispatchable_types = new std::vector(); if (registered_dispatchable_types->empty()) { - static PyObject* composite_tensor = - swig::GetRegisteredPyObject("CompositeTensor"); + static PyObject* composite_tensor = ImportTypeFromModule( + "tensorflow.python.framework.composite_tensor", + "CompositeTensor"); Py_INCREF(composite_tensor); registered_dispatchable_types->push_back( Safe_PyObjectPtr(composite_tensor)); diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py index 0b870b54c96662..b578bf6e0a1545 100644 --- a/tensorflow/python/framework/sparse_tensor.py +++ b/tensorflow/python/framework/sparse_tensor.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import override_binary_operator from tensorflow.python.framework import tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec @@ -32,10 +33,10 @@ from tensorflow.python.framework import type_spec from tensorflow.python.framework import type_spec_registry from tensorflow.python.ops import array_ops_stack +from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_sparse_ops from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.types import internal -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util.tf_export import tf_export # pylint: disable=protected-access @@ -371,7 +372,6 @@ def _is_eager(self): SparseTensorValue = collections.namedtuple("SparseTensorValue", ["indices", "values", "dense_shape"]) tf_export(v1=["SparseTensorValue"])(SparseTensorValue) -_pywrap_utils.RegisterType("SparseTensorValue", SparseTensorValue) @tf_export("SparseTensorSpec") @@ -574,3 +574,68 @@ def is_sparse(x): `tf.compat.v1.SparseTensorValue`. """ return isinstance(x, (SparseTensor, SparseTensorValue)) + + +# Conversion table for __truediv__. None entries mean no conversion required. +_TRUEDIV_TABLE = { + dtypes.uint8: dtypes.float32, + dtypes.int8: dtypes.float32, + dtypes.uint16: dtypes.float32, + dtypes.int16: dtypes.float32, + dtypes.uint32: dtypes.float64, + dtypes.int32: dtypes.float64, + dtypes.uint64: dtypes.float64, + dtypes.int64: dtypes.float64, + dtypes.bfloat16: None, + dtypes.float16: None, + dtypes.float32: None, + dtypes.float64: None, + dtypes.complex64: None, + dtypes.complex128: None, +} + + +# NOTE: the support of "sparse (true)div dense" is currently not baked in into +# "tf.(true_)div()". Until such an API decision is made, the supported usage is +# to explicitly use the "/" operator to invoke either truediv or div. 
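Note: as the comment above says, the supported spelling is the "/" operator. An illustrative sketch, not part of the patch, of what the operator registrations at the end of this file enable:

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import sparse_tensor

sp = sparse_tensor.SparseTensor(
    indices=[[0, 0], [1, 1]], values=[2, 8], dense_shape=[2, 2])
dense = constant_op.constant([[2, 1], [1, 4]])
# __truediv__ dispatches to the sparse/dense element-wise division; int32
# inputs are promoted to float64 per _TRUEDIV_TABLE.
quotient = sp / dense  # SparseTensor with values [1.0, 2.0]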
+def _sparse_dense_truediv(sp_indices, sp_values, sp_shape, y, name=None): + """Internal helper function for 'sp_t / dense_t'.""" + with ops.name_scope( + name, "truediv", [sp_indices, sp_values, sp_shape, y] + ) as name: + sp_values = ops.convert_to_tensor(sp_values, name="sp_values") + y = ops.convert_to_tensor(y, name="y") + x_dtype = sp_values.dtype.base_dtype + y_dtype = y.dtype.base_dtype + if x_dtype != y_dtype: + raise TypeError( + "`x` and `y` must have the same dtype, " + f"got {x_dtype!r} != {y_dtype!r}." + ) + try: + dtype = _TRUEDIV_TABLE[x_dtype] + except KeyError as exc: + raise TypeError( + f"Invalid dtype {x_dtype!r} in __truediv__. Expected one " + f"of {{{', '.join([repr(x) for x in _TRUEDIV_TABLE.keys()])}}}." + ) from exc + if dtype is not None: + sp_values = gen_math_ops.cast(sp_values, dtype) + y = gen_math_ops.cast(y, dtype) + return gen_sparse_ops.sparse_dense_cwise_div( + sp_indices, sp_values, sp_shape, y, name=name + ) + + +# NOTE(aselle): When integer division is added for sparse_dense_cwise, +# div, truediv, and floordiv should be delegated appropriately for +# Python semantics, analogous to dense cwise tensor operations. +override_binary_operator.override_binary_operator_helper( + gen_sparse_ops.sparse_dense_cwise_div, "div", SparseTensor +) # pylint: disable=protected-access +override_binary_operator.override_binary_operator_helper( + _sparse_dense_truediv, "truediv", SparseTensor +) # pylint: disable=protected-access +override_binary_operator.override_binary_operator_helper( + gen_sparse_ops.sparse_dense_cwise_mul, "mul", SparseTensor +) # pylint: disable=protected-access diff --git a/tensorflow/python/framework/stack.py b/tensorflow/python/framework/stack.py index 5a1e8fbd1311fd..a91fc99be530e9 100644 --- a/tensorflow/python/framework/stack.py +++ b/tensorflow/python/framework/stack.py @@ -14,39 +14,43 @@ # ============================================================================== """Classes used to handle thread-local stacks.""" +from collections.abc import Iterator import threading +from typing import Generic, Optional, TypeVar from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export +T = TypeVar("T") -class DefaultStack(threading.local): + +class DefaultStack(threading.local, Generic[T]): """A thread-local stack of objects for providing implicit defaults.""" def __init__(self): super().__init__() self._enforce_nesting = True - self.stack = [] + self.stack: list[T] = [] - def get_default(self): + def get_default(self) -> Optional[T]: return self.stack[-1] if self.stack else None - def reset(self): + def reset(self) -> None: self.stack = [] - def is_cleared(self): + def is_cleared(self) -> bool: return not self.stack @property - def enforce_nesting(self): + def enforce_nesting(self) -> bool: return self._enforce_nesting @enforce_nesting.setter - def enforce_nesting(self, value): + def enforce_nesting(self, value: bool): self._enforce_nesting = value @tf_contextlib.contextmanager - def get_controller(self, default): + def get_controller(self, default: T) -> Iterator[T]: """A context manager for manipulating a default stack.""" self.stack.append(default) try: diff --git a/tensorflow/python/framework/tensor.py b/tensorflow/python/framework/tensor.py index 5fa83194866e8d..823d31d38eeb12 100644 --- a/tensorflow/python/framework/tensor.py +++ b/tensorflow/python/framework/tensor.py @@ -40,7 +40,6 @@ from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.types import core as 
core_tf_types from tensorflow.python.types import internal -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import compat from tensorflow.python.util import deprecation from tensorflow.python.util import object_identity @@ -1455,7 +1454,6 @@ def do_decode(self, value, decode_fn): nested_structure_coder.register_codec(_BoundedTensorSpecCodec()) trace_type.register_serializable(BoundedTensorSpec) -_pywrap_utils.RegisterType("TensorSpec", TensorSpec) # Note: we do not include Tensor names when constructing TypeSpecs. type_spec.register_type_spec_from_value_converter( diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 888b681cee369a..4c982e87873f09 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -18,7 +18,7 @@ import collections from collections import OrderedDict -from collections.abc import Iterator +from collections.abc import Iterable, Iterator, Callable, Collection, Sequence import contextlib import functools import gc @@ -30,16 +30,21 @@ import tempfile import threading import time -from typing import Union +from typing import Any, cast, Union, Optional, overload, TypeVar import unittest from absl.testing import parameterized import numpy as np from google.protobuf import descriptor_pool +from google.protobuf import message from google.protobuf import text_format from tensorflow.core.config import flags from tensorflow.core.framework import graph_pb2 +from tensorflow.core.framework import node_def_pb2 +from tensorflow.core.framework import tensor_pb2 +from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python import pywrap_sanitizers from tensorflow.python import tf2 @@ -99,13 +104,19 @@ from tensorflow.python.util.tf_export import tf_export +_F = TypeVar("_F", bound=Callable[..., Any]) +_T = TypeVar("_T") +_TC = TypeVar("_TC", bound=type["TensorFlowTestCase"]) + + # If the below import is made available through the BUILD rule, then this # function is overridden and will instead return True and cause Tensorflow # graphs to be compiled with XLA. -def is_xla_enabled(): +def is_xla_enabled() -> bool: return False +# pytype: disable=import-error try: from tensorflow.python.framework.is_xla_test_true import is_xla_enabled # pylint: disable=g-import-not-at-top, unused-import except Exception: # pylint: disable=broad-except @@ -114,7 +125,7 @@ def is_xla_enabled(): # Uses the same mechanism as above to selectively enable/disable MLIR # compilation. 
-def is_mlir_bridge_enabled(): +def is_mlir_bridge_enabled() -> Optional[bool]: return None @@ -125,36 +136,39 @@ def is_mlir_bridge_enabled(): from tensorflow.python.framework.is_mlir_bridge_test_true import is_mlir_bridge_enabled # pylint: disable=g-import-not-at-top, unused-import except ImportError: pass +# pytype: enable=import-error -def is_asan_enabled(): +def is_asan_enabled() -> bool: """Check if ASAN is enabled.""" return pywrap_sanitizers.is_asan_enabled() -def is_msan_enabled(): +def is_msan_enabled() -> bool: """Check if MSAN is enabled.""" return pywrap_sanitizers.is_msan_enabled() -def is_tsan_enabled(): +def is_tsan_enabled() -> bool: """Check if TSAN is enabled.""" return pywrap_sanitizers.is_tsan_enabled() -def is_ubsan_enabled(): +def is_ubsan_enabled() -> bool: """Check if UBSAN is enabled.""" return pywrap_sanitizers.is_ubsan_enabled() -def _get_object_count_by_type(exclude=()): +def _get_object_count_by_type( + exclude: Iterable[Any] = (), +) -> collections.Counter[str]: return ( collections.Counter([type(obj).__name__ for obj in gc.get_objects()]) - collections.Counter([type(obj).__name__ for obj in exclude])) @tf_export("test.gpu_device_name") -def gpu_device_name(): +def gpu_device_name() -> str: """Returns the name of a GPU device if available or a empty string. This method should only be used in tests written with `tf.test.TestCase`. @@ -175,7 +189,9 @@ def gpu_device_name(): return "" -def assert_ops_in_graph(expected_ops, graph): +def assert_ops_in_graph( + expected_ops: dict[str, str], graph: ops.Graph +) -> dict[str, node_def_pb2.NodeDef]: """Assert all expected operations are found. Args: @@ -188,8 +204,8 @@ def assert_ops_in_graph(expected_ops, graph): Raises: ValueError: If the expected ops are not present in the graph. """ - actual_ops = {} - gd = graph.as_graph_def() + actual_ops: dict[str, node_def_pb2.NodeDef] = {} + gd = cast(graph_pb2.GraphDef, graph.as_graph_def()) for node in gd.node: if node.name in expected_ops: if expected_ops[node.name] != node.op: @@ -203,7 +219,9 @@ def assert_ops_in_graph(expected_ops, graph): @tf_export("test.assert_equal_graph_def", v1=[]) -def assert_equal_graph_def_v2(expected, actual): +def assert_equal_graph_def_v2( + expected: graph_pb2.GraphDef, actual: graph_pb2.GraphDef +) -> None: """Asserts that two `GraphDef`s are (mostly) the same. Compares two `GraphDef` protos for equality, ignoring versions and ordering of @@ -224,8 +242,12 @@ def assert_equal_graph_def_v2(expected, actual): @tf_export(v1=["test.assert_equal_graph_def"]) -def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False, - hash_table_shared_name=False): +def assert_equal_graph_def_v1( + actual: graph_pb2.GraphDef, + expected: graph_pb2.GraphDef, + checkpoint_v2: bool = False, + hash_table_shared_name: bool = False +) -> None: """Asserts that two `GraphDef`s are (mostly) the same. 
Compares two `GraphDef` protos for equality, ignoring versions and ordering of @@ -248,8 +270,12 @@ def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False, hash_table_shared_name) -def assert_equal_graph_def(actual, expected, checkpoint_v2=False, - hash_table_shared_name=False): +def assert_equal_graph_def( + actual: graph_pb2.GraphDef, + expected: graph_pb2.GraphDef, + checkpoint_v2: bool = False, + hash_table_shared_name: bool = False +)-> None: if not isinstance(actual, graph_pb2.GraphDef): raise TypeError("Expected tf.GraphDef for actual, got %s" % type(actual).__name__) @@ -271,7 +297,11 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False, raise AssertionError(compat.as_str(diff)) -def assert_meta_graph_protos_equal(tester, a, b): +def assert_meta_graph_protos_equal( + tester: "TensorFlowTestCase", + a: meta_graph_pb2.MetaGraphDef, + b: meta_graph_pb2.MetaGraphDef, +) -> None: """Compares MetaGraphDefs `a` and `b` in unit test class `tester`.""" # Carefully check the collection_defs tester.assertEqual(set(a.collection_def), set(b.collection_def)) @@ -279,7 +309,7 @@ def assert_meta_graph_protos_equal(tester, a, b): for k in collection_keys: a_value = a.collection_def[k] b_value = b.collection_def[k] - proto_type = ops.get_collection_proto_type(k) + proto_type = cast(type[message.Message], ops.get_collection_proto_type(k)) if proto_type: a_proto = proto_type() b_proto = proto_type() @@ -315,11 +345,12 @@ def assert_meta_graph_protos_equal(tester, a, b): _SHARDED_SAVE_OP_PATTERN = "_temp_[0-9a-z]{32}/part" -def _strip_checkpoint_v2_randomized(graph_def): +def _strip_checkpoint_v2_randomized(graph_def: graph_pb2.GraphDef) -> None: for node in graph_def.node: - delete_keys = [] + delete_keys: list[str] = [] for attr_key in node.attr: - attr_tensor_value = node.attr[attr_key].tensor + attr_tensor_value = cast( + tensor_pb2.TensorProto, node.attr[attr_key].tensor) if attr_tensor_value and len(attr_tensor_value.string_val) == 1: attr_tensor_string_value = attr_tensor_value.string_val[0] if (attr_tensor_string_value and @@ -333,9 +364,9 @@ def _strip_checkpoint_v2_randomized(graph_def): _TABLE_SHARED_NAME_PATTERN = r"hash_table_[0-9a-z\-]+" -def _strip_hash_table_shared_name(graph_def): +def _strip_hash_table_shared_name(graph_def: graph_pb2.GraphDef) -> None: for node in graph_def.node: - delete_keys = [] + delete_keys: list[str] = [] if node.op == "HashTableV2" and "shared_name" in node.attr: if re.match(compat.as_bytes(_TABLE_SHARED_NAME_PATTERN), node.attr["shared_name"].s): @@ -344,35 +375,37 @@ def _strip_hash_table_shared_name(graph_def): del node.attr[attr_key] -def IsGoogleCudaEnabled(): +def IsGoogleCudaEnabled() -> bool: return _pywrap_util_port.IsGoogleCudaEnabled() -def IsBuiltWithROCm(): +def IsBuiltWithROCm() -> bool: return _pywrap_util_port.IsBuiltWithROCm() -def IsBuiltWithXLA(): +def IsBuiltWithXLA() -> bool: return _pywrap_util_port.IsBuiltWithXLA() -def IsBuiltWithNvcc(): +def IsBuiltWithNvcc() -> bool: return _pywrap_util_port.IsBuiltWithNvcc() -def GpuSupportsHalfMatMulAndConv(): +def GpuSupportsHalfMatMulAndConv() -> bool: return _pywrap_util_port.GpuSupportsHalfMatMulAndConv() -def IsMklEnabled(): +def IsMklEnabled() -> bool: return _pywrap_util_port.IsMklEnabled() -def InstallStackTraceHandler(): +def InstallStackTraceHandler() -> None: _pywrap_stacktrace_handler.InstallStacktraceHandler() -def NHWCToNCHW(input_tensor): +def NHWCToNCHW( + input_tensor: Union[tensor_lib.Tensor, list[int]] +) -> Union[tensor_lib.Tensor, list[int]]: 
"""Converts the input from the NHWC format to NCHW. Args: @@ -391,7 +424,9 @@ def NHWCToNCHW(input_tensor): return [input_tensor[a] for a in new_axes[ndims]] -def NHWCToNCHW_VECT_C(input_shape_or_tensor): +def NHWCToNCHW_VECT_C( + input_shape_or_tensor: Union[tensor_lib.Tensor, list[int]] +)-> Union[tensor_lib.Tensor, list[int]]: """Transforms the input from the NHWC layout to NCHW_VECT_C layout. Note: Does not include quantization or type conversion steps, which should @@ -409,7 +444,7 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor): """ permutations = {5: [0, 3, 1, 2, 4], 6: [0, 4, 1, 2, 3, 5]} is_tensor = isinstance(input_shape_or_tensor, tensor_lib.Tensor) - temp_shape = ( + temp_shape: list[int] = ( input_shape_or_tensor.shape.as_list() if is_tensor else input_shape_or_tensor) if temp_shape[-1] % 4 != 0: @@ -426,7 +461,9 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor): return [temp_shape[a] for a in permutation] -def NCHW_VECT_CToNHWC(input_shape_or_tensor): +def NCHW_VECT_CToNHWC( + input_shape_or_tensor: Union[tensor_lib.Tensor, list[int]] +) -> Union[tensor_lib.Tensor, list[int]]: """Transforms the input from the NCHW_VECT_C layout to NHWC layout. Note: Does not include de-quantization or type conversion steps, which should @@ -443,7 +480,7 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor): """ permutations = {5: [0, 2, 3, 1, 4], 6: [0, 2, 3, 4, 1, 5]} is_tensor = isinstance(input_shape_or_tensor, tensor_lib.Tensor) - input_shape = ( + input_shape: list[int] = ( input_shape_or_tensor.shape.as_list() if is_tensor else input_shape_or_tensor) if input_shape[-1] != 4: @@ -458,7 +495,9 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor): return nhwc_shape -def NCHWToNHWC(input_tensor): +def NCHWToNHWC( + input_tensor: Union[tensor_lib.Tensor, list[int]] +) -> Union[tensor_lib.Tensor, list[int]]: """Converts the input from the NCHW format to NHWC. Args: @@ -477,7 +516,7 @@ def NCHWToNHWC(input_tensor): return [input_tensor[a] for a in new_axes[ndims]] -def skip_if(condition): +def skip_if(condition: Union[Callable[[], bool], bool]) -> Callable[[_F], _F]: """Skips the decorated function if condition is or evaluates to True. Args: @@ -488,7 +527,7 @@ def skip_if(condition): The wrapped function """ - def real_skip_if(fn): + def real_skip_if(fn: _F) -> _F: def wrapper(*args, **kwargs): if callable(condition): @@ -504,7 +543,11 @@ def wrapper(*args, **kwargs): @contextlib.contextmanager -def skip_if_error(test_obj, error_type, messages=None): +def skip_if_error( + test_obj: unittest.TestCase, + error_type: type[Exception], + messages: Union[str, list[str], None] = None +) -> Iterator[None]: """Context manager to skip cases not considered failures by the tests. Note that this does not work if used in setUpClass/tearDownClass. @@ -535,17 +578,17 @@ def skip_if_error(test_obj, error_type, messages=None): raise -def enable_c_shapes(fn): +def enable_c_shapes(fn: _F) -> _F: """No-op. TODO(b/74620627): Remove this.""" return fn -def with_c_shapes(cls): +def with_c_shapes(cls: type[_T]) -> type[_T]: """No-op. TODO(b/74620627): Remove this.""" return cls -def enable_control_flow_v2(fn): +def enable_control_flow_v2(fn: _F) -> _F: """Decorator for enabling CondV2 and WhileV2 on a test. Note this enables using CondV2 and WhileV2 after running the test class's @@ -572,7 +615,7 @@ def wrapper(*args, **kwargs): return wrapper -def with_control_flow_v2(cls): +def with_control_flow_v2(cls: _TC) -> _TC: """Adds methods that call original methods with WhileV2 and CondV2 enabled. 
Note this enables CondV2 and WhileV2 in new methods after running the test @@ -627,7 +670,7 @@ def testDisabledForV2(self): return cls -def disable_control_flow_v2(unused_msg): +def disable_control_flow_v2(unused_msg: str) -> Callable[[_F], _F]: """Decorator for a function in a with_control_flow_v2 enabled test class. Blocks the function from being run with v2 control flow ops. @@ -639,14 +682,14 @@ def disable_control_flow_v2(unused_msg): The wrapped function with _disable_control_flow_v2 attr set to True. """ - def wrapper(func): + def wrapper(func: _F) -> _F: func._disable_control_flow_v2 = True return func return wrapper -def enable_output_all_intermediates(fn): +def enable_output_all_intermediates(fn: _F) -> _F: """Force-enable outputing all intermediates from functional control flow ops. Args: @@ -669,26 +712,27 @@ def wrapper(*args, **kwargs): return wrapper -def assert_no_new_pyobjects_executing_eagerly(func=None, warmup_iters=2): +def assert_no_new_pyobjects_executing_eagerly( + warmup_iters: int = 2, +) -> Callable[[Callable[..., Any]], Callable[..., None]]: """Decorator for asserting that no new Python objects persist after a test. - Runs the test multiple times executing eagerly, first as a warmup and then to - let objects accumulate. The warmup helps ignore caches which do not grow as - the test is run repeatedly. + Returns a decorator that runs the test multiple times executing eagerly, + first as a warmup and then to let objects accumulate. The warmup helps ignore + caches which do not grow as the test is run repeatedly. Useful for checking that there are no missing Py_DECREFs in the C exercised by a bit of Python. Args: - func: The function to test. warmup_iters: The numer of warmup iterations, excluded from measuring. Returns: - The wrapped function performing the test. + A decorator function which can be applied to the test function. """ - def wrap_f(f): - def decorator(self, *args, **kwargs): + def wrap_f(f: Callable[..., Any]) -> Callable[..., None]: + def decorator(self: "TensorFlowTestCase", *args, **kwargs) -> None: """Warms up, gets object counts, runs the test, checks for new objects.""" with context.eager_mode(): gc.disable() @@ -780,15 +824,12 @@ def decorator(self, *args, **kwargs): "The following objects were newly created: %s" % str(obj_count_by_type)) gc.enable() - return decorator + return tf_decorator.make_decorator(f, decorator) - if func is None: - return wrap_f - else: - return wrap_f(func) + return wrap_f -def assert_no_new_tensors(f): +def assert_no_new_tensors(f: _F) -> _F: """Decorator for asserting that no new Tensors persist after a test. Mainly useful for checking that code using the Python C API has correctly @@ -807,10 +848,10 @@ def assert_no_new_tensors(f): The decorated test case. """ - def decorator(self, **kwargs): + def decorator(self: "TensorFlowTestCase", **kwargs): """Finds existing Tensors, runs the test, checks for new Tensors.""" - def _is_tensorflow_object(obj): + def _is_tensorflow_object(obj) -> bool: try: return isinstance(obj, (tensor_lib.Tensor, variables.Variable, @@ -821,7 +862,7 @@ def _is_tensorflow_object(obj): tensors_before = set( id(obj) for obj in gc.get_objects() if _is_tensorflow_object(obj)) - outside_executed_eagerly = context.executing_eagerly() + outside_executed_eagerly = cast(bool, context.executing_eagerly()) # Run the test in a new graph so that collections get cleared when it's # done, but inherit the graph key so optimizers behave. 
outside_graph_key = ops.get_default_graph()._graph_key @@ -847,12 +888,12 @@ def _is_tensorflow_object(obj): ))) return result - return decorator + return tf_decorator.make_decorator(f, decorator) -def _find_reference_cycle(objects, idx): +def _find_reference_cycle(objects: Sequence[Any], idx: int) -> bool: - def get_ignore_reason(obj, denylist): + def get_ignore_reason(obj: Any, denylist: Collection[Any]) -> Optional[str]: """Tests whether an object should be omitted from the dependency graph.""" if len(denylist) > 100: return "" @@ -869,7 +910,9 @@ def get_ignore_reason(obj, denylist): # Note: this function is meant to help with diagnostics. Its output is purely # a human-readable representation, so you may freely modify it to suit your # needs. - def describe(obj, denylist, leaves_only=False): + def describe( + obj: Any, denylist: Collection[Any], leaves_only: bool = False, + ) -> str: """Returns a custom human-readable summary of obj. Args: @@ -901,7 +944,12 @@ def describe(obj, denylist, leaves_only=False): else: return "{}, {}".format(type(obj), id(obj)) - def build_ref_graph(obj, graph, reprs, denylist): + def build_ref_graph( + obj: Any, + graph: dict[int, list[int]], + reprs: dict[int, str], + denylist: tuple[Any, ...], + ) -> None: """Builds a reference graph as -> . Args: @@ -927,7 +975,12 @@ def build_ref_graph(obj, graph, reprs, denylist): build_ref_graph(r, graph, reprs, denylist) reprs[r_id] = describe(r, denylist) - def find_cycle(el, graph, reprs, path): + def find_cycle( + el: int, + graph: dict[int, list[int]], + reprs: dict[int, str], + path: tuple[int, ...], + ) -> Optional[bool]: """Finds and prints a single cycle in the dependency graph.""" if el not in graph: return @@ -943,8 +996,8 @@ def find_cycle(el, graph, reprs, path): return False obj = objects[idx] - graph = {} # referrer ID -> object ID - reprs = {} # object ID -> description + graph: dict[int, list[int]] = {} # referrer ID -> object ID + reprs: dict[int, str] = {} # object ID -> description build_ref_graph(obj, graph, reprs, (objects, graph, reprs, get_ignore_reason, describe, build_ref_graph, find_cycle)) for k in graph: @@ -953,7 +1006,7 @@ def find_cycle(el, graph, reprs, path): return False -def assert_no_garbage_created(f): +def assert_no_garbage_created(f: _F) -> _F: """Test method decorator to assert that no garbage has been created. Note that this decorator sets DEBUG_SAVEALL, which in some Python interpreters @@ -969,7 +1022,7 @@ def assert_no_garbage_created(f): # FIXME(power) -- Update documentation, we no longer care if garbage is # created, we only want to verify we don't have memory leaks. - def decorator(self, **kwargs): + def decorator(self: "TensorFlowTestCase", **kwargs): """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage.""" gc.disable() previous_debug_flags = gc.get_debug() @@ -995,7 +1048,7 @@ def decorator(self, **kwargs): logging.error("Object %d of %d", i, len(gc.garbage) - previous_garbage) - def _safe_object_str(obj): + def _safe_object_str(obj) -> str: return "<%s %d>" % (obj.__class__.__name__, id(obj)) logging.error(" Object type: %s", _safe_object_str(obj)) @@ -1033,7 +1086,7 @@ def _safe_object_str(obj): return decorator -def _combine_named_parameters(**kwargs): +def _combine_named_parameters(**kwargs) -> list[OrderedDict[str, Any]]: """Generate combinations based on its keyword arguments. Two sets of returned combinations can be concatenated using +. 
Their product @@ -1049,7 +1102,7 @@ def _combine_named_parameters(**kwargs): corresponding keyword argument values. """ sort_by_key = lambda k: k[0] - combinations = [] + combinations: list[list[tuple[str, Any]]] = [] for key, values in sorted(kwargs.items(), key=sort_by_key): if not isinstance(values, list): values = [values] @@ -1058,7 +1111,9 @@ def _combine_named_parameters(**kwargs): return [OrderedDict(result) for result in itertools.product(*combinations)] -def generate_combinations_with_testcase_name(**kwargs): +def generate_combinations_with_testcase_name( + **kwargs, +) -> list[OrderedDict[str, Any]]: """Generate combinations based on its keyword arguments using combine(). This function calls combine() and appends a testcase name to the list of @@ -1075,7 +1130,7 @@ def generate_combinations_with_testcase_name(**kwargs): corresponding keyword argument values. """ combinations = _combine_named_parameters(**kwargs) - named_combinations = [] + named_combinations: list[OrderedDict[str, Any]] = [] for combination in combinations: assert isinstance(combination, OrderedDict) name = "".join([ @@ -1091,7 +1146,7 @@ def generate_combinations_with_testcase_name(**kwargs): return named_combinations -def run_all_in_graph_and_eager_modes(cls): +def run_all_in_graph_and_eager_modes(cls: _TC) -> _TC: """Execute all test methods in the given class with and without eager.""" base_decorator = run_in_graph_and_eager_modes for name in dir(cls): @@ -1107,7 +1162,7 @@ def run_all_in_graph_and_eager_modes(cls): return cls -def run_class_in_v1_v2(cls): +def run_class_in_v1_v2(cls: _TC) -> _TC: """Execute all test methods in a given class in v1 and v2 modes.""" base_decorator = run_in_v1_v2 for name in dir(cls): @@ -1126,7 +1181,7 @@ def run_class_in_v1_v2(cls): return cls -def enable_nested_function_shape_inference(fn): +def enable_nested_function_shape_inference(fn: _F) -> _F: """Decorator for enabling nested_function_shape_inference on a test. This function returns a decorator intended to be applied to test methods in @@ -1163,7 +1218,7 @@ def wrapper(*args, **kwargs): return wrapper -def enable_quantized_dtypes_training(fn): +def enable_quantized_dtypes_training(fn: _F) -> _F: """Decorator for enabling quantized_dtypes_training on a test. This function returns a decorator intended to be applied to test methods in @@ -1200,7 +1255,7 @@ def wrapper(*args, **kwargs): return wrapper -def enable_eager_op_as_function(fn): +def enable_eager_op_as_function(fn: _F) -> _F: """Returns the same fn. This will be removed once all usages are removed. Args: @@ -1216,8 +1271,27 @@ def wrapper(*args, **kwargs): return wrapper +@overload +def with_eager_op_as_function( + cls: type[_T], + only_as_function: bool = False, +) -> type[_T]: + ... + + +@overload +def with_eager_op_as_function( + cls: None = None, + only_as_function: bool = False, +) -> Callable[[type[_T]], type[_T]]: + ... + + @tf_export("test.with_eager_op_as_function") -def with_eager_op_as_function(cls=None, only_as_function=False): # pylint: disable=unused-argument +def with_eager_op_as_function( + cls: Optional[type[_T]] = None, + only_as_function: bool = False, # pylint: disable=unused-argument +) -> Union[Callable[[type[_T]], type[_T]], type[_T]]: """Returns the same class. This will be removed once all usages are removed. 
Args: @@ -1228,16 +1302,16 @@ def with_eager_op_as_function(cls=None, only_as_function=False): # pylint: disa cls """ - def decorator(cls): + def decorator(cls: type[_T]) -> type[_T]: return cls if cls is not None: return decorator(cls) - return decorator + return decorator # pytype: disable=bad-return-type -def enable_graph_building_optimization(fn): +def enable_graph_building_optimization(fn: _F) -> _F: """Decorator for enabling graph_building_optimization on a test. This function returns a decorator intended to be applied to test methods in @@ -1273,7 +1347,7 @@ def wrapper(*args, **kwargs): return wrapper -def add_graph_building_optimization_tests(cls=None): +def add_graph_building_optimization_tests(cls: _TC) -> _TC: """Adds methods with graph_building_optimization enabled to the test suite. Example: @@ -1302,25 +1376,19 @@ def testBarWithGraphBuildingOptimization(self): cls with new test methods added. """ - def decorator(cls): - if flags.config().graph_building_optimization.value(): - return cls - - for name, value in cls.__dict__.copy().items(): - if (callable(value) and - (name.startswith(unittest.TestLoader.testMethodPrefix) or - name.startswith("benchmark"))): - setattr(cls, name + "WithGraphBuildingOptimization", - enable_graph_building_optimization(value)) + if flags.config().graph_building_optimization.value(): return cls - if cls is not None: - return decorator(cls) - - return decorator + for name, value in cls.__dict__.copy().items(): + if (callable(value) and + (name.startswith(unittest.TestLoader.testMethodPrefix) or + name.startswith("benchmark"))): + setattr(cls, name + "WithGraphBuildingOptimization", + enable_graph_building_optimization(value)) + return cls -def disable_eager_op_as_function(unused_msg): +def disable_eager_op_as_function(unused_msg: str) -> Callable[[_F], _F]: """Decorator for a function in a with_eager_op_as_function enabled test class. Blocks the function from being run with eager_op_as_function enabled. @@ -1334,7 +1402,7 @@ def disable_eager_op_as_function(unused_msg): return _disable_test(execute_func=False) -def set_xla_env_flag(func=None, flag=""): +def set_xla_env_flag(flag: str = "") -> Callable[[_F], _F]: """Decorator for setting XLA_FLAGS prior to running a test. This function returns a decorator intended to be applied to test methods in @@ -1351,14 +1419,14 @@ def testFoo(self): ... Args: - func: The function to be wrapped. flag: The xla flag to be set in the XLA_FLAGS env variable. Returns: - The wrapped function. + A decorator which sets the configured flag in XLA_FLAGS for the decorated + function. """ - def decorator(f): + def decorator(f: _F) -> _F: @functools.wraps(f) def decorated(*args, **kwargs): @@ -1377,13 +1445,12 @@ def decorated(*args, **kwargs): return decorated - if func is not None: - return decorator(func) - return decorator -def build_as_function_and_v1_graph(func=None): +def build_as_function_and_v1_graph( + func: Callable[..., Any], +) -> Callable[..., None]: """Run a test case in v1 graph mode and inside tf.function in eager mode. WARNING: This decorator can only be used in test cases that statically checks @@ -1400,47 +1467,46 @@ def build_as_function_and_v1_graph(func=None): Decorated test case function. 
""" - def decorator(f): - if tf_inspect.isclass(f): - raise ValueError( - "`run_in_graph_mode_and_function` only supports test methods.") - - @parameterized.named_parameters(("_v1_graph", "v1_graph"), - ("_function", "function")) - @functools.wraps(f) - def decorated(self, run_mode, *args, **kwargs): - if run_mode == "v1_graph": - with ops.Graph().as_default(): - f(self, *args, **kwargs) - elif run_mode == "function": - - @def_function.function - def function_in_eager(): - f(self, *args, **kwargs) - - # Create a new graph for the eagerly executed version of this test for - # better isolation. - graph_for_eager_test = ops.Graph() - with graph_for_eager_test.as_default(), context.eager_mode(): - function_in_eager() - ops.dismantle_graph(graph_for_eager_test) - else: - raise ValueError("Unknown run mode %s" % run_mode) - - return decorated + if tf_inspect.isclass(func): + raise ValueError( + "`run_in_graph_mode_and_function` only supports test methods.") + + @parameterized.named_parameters(("_v1_graph", "v1_graph"), + ("_function", "function")) + @functools.wraps(func) + def decorated( + self: "TensorFlowTestCase", + run_mode: str, + *args, + **kwargs, + ) -> None: + if run_mode == "v1_graph": + with ops.Graph().as_default(): + func(self, *args, **kwargs) + elif run_mode == "function": + + @def_function.function + def function_in_eager(): + func(self, *args, **kwargs) - if func is not None: - return decorator(func) + # Create a new graph for the eagerly executed version of this test for + # better isolation. + graph_for_eager_test = ops.Graph() + with graph_for_eager_test.as_default(), context.eager_mode(): + function_in_eager() + ops.dismantle_graph(graph_for_eager_test) + else: + raise ValueError("Unknown run mode %s" % run_mode) - return decorator + return decorated -def run_in_async_and_sync_mode(f): +def run_in_async_and_sync_mode(f: _F) -> _F: """Execute the test in async mode and sync mode.""" @parameterized.named_parameters([("Async", True), ("", False)]) @functools.wraps(f) - def decorator(self, async_mode, *args, **kwargs): + def decorator(self: "TensorFlowTestCase", async_mode: bool, *args, **kwargs): if async_mode: with context.execution_mode(context.ASYNC): f(self, *args, **kwargs) @@ -1450,10 +1516,35 @@ def decorator(self, async_mode, *args, **kwargs): return decorator -def run_in_graph_and_eager_modes(func=None, - config=None, - use_gpu=True, - assert_no_eager_garbage=False): +@overload +def run_in_graph_and_eager_modes( + func: Callable[..., Any], + config: Optional[config_pb2.ConfigProto] = None, + use_gpu: bool = True, + assert_no_eager_garbage: bool = False, +) -> Callable[..., None]: + ... + + +@overload +def run_in_graph_and_eager_modes( + func: None = None, + config: Optional[config_pb2.ConfigProto] = None, + use_gpu: bool = True, + assert_no_eager_garbage: bool = False, +) -> Callable[[Callable[..., Any]], Callable[..., None]]: + ... + + +def run_in_graph_and_eager_modes( + func: Optional[Callable[..., Any]] = None, + config: Optional[config_pb2.ConfigProto] = None, + use_gpu: bool = True, + assert_no_eager_garbage: bool = False, +) -> Union[ + Callable[[Callable[..., Any]], Callable[..., None]], + Callable[..., None], +]: """Execute the decorated test with and without enabling eager execution. This function returns a decorator intended to be applied to test methods in @@ -1511,13 +1602,13 @@ def test_foo(self): eager execution enabled. 
""" - def decorator(f): + def decorator(f: Callable[..., Any]) -> Callable[..., None]: if tf_inspect.isclass(f): raise ValueError( "`run_in_graph_and_eager_modes` only supports test methods. " "Did you mean to use `run_all_in_graph_and_eager_modes`?") - def decorated(self, *args, **kwargs): + def decorated(self: "TensorFlowTestCase", *args, **kwargs) -> None: logging.info("Running %s in GRAPH mode.", f.__name__) try: with context.graph_mode(), self.subTest("graph_mode"): @@ -1536,7 +1627,7 @@ def decorated(self, *args, **kwargs): except unittest.case.SkipTest: pass - def run_eagerly(self, **kwargs): + def run_eagerly(self: "TensorFlowTestCase", **kwargs) -> None: logging.info("Running %s in EAGER mode.", f.__name__) if not use_gpu: with ops.device("/device:CPU:0"): @@ -1573,17 +1664,15 @@ def run_eagerly(self, **kwargs): return decorator -def run_in_v1_v2(func=None, - device_to_use: str = None, - assert_no_eager_garbage: bool = False): +def run_in_v1_v2( + device_to_use: Optional[str] = None, + assert_no_eager_garbage: bool = False, +) -> Callable[[Callable[..., Any]], Callable[..., None]]: """Execute the decorated test in v1 and v2 modes. The overall execution is similar to that of `run_in_graph_and_eager_mode`. Args: - func: A test function/method to be decorated. If `func` is None, this method - returns a decorator the can be applied to a function. Otherwise, an - already applied decorator is returned. device_to_use: A string in the following format: "/device:CPU:0". assert_no_eager_garbage: If True, sets DEBUG_SAVEALL on the garbage collector and asserts that no extra garbage has been created when running @@ -1600,14 +1689,13 @@ def run_in_v1_v2(func=None, A decorator that runs a given test in v1 and v2 modes. """ - decorator_tag = "wrapped_with_v1_v2_decorator" - if hasattr(func, decorator_tag): - # Already decorated with this very same decorator - return func - - def decorator(f): + def decorator(f: Callable[..., Any]) -> Callable[..., None]: + decorator_tag = "wrapped_with_v1_v2_decorator" + if hasattr(f, decorator_tag): + # Already decorated with this very same decorator + return f - def decorated(self, *args, **kwargs): + def decorated(self: "TensorFlowTestCase", *args, **kwargs) -> None: logging.info("Running %s in V1 mode.", f.__name__) try: with self.subTest("V1_mode"): @@ -1616,7 +1704,7 @@ def decorated(self, *args, **kwargs): except unittest.case.SkipTest: pass - def run_v2(self, **kwargs): + def run_v2(self: "TensorFlowTestCase", **kwargs) -> None: logging.info("Running %s in V2 mode.", f.__name__) if device_to_use: with ops.device(device_to_use): @@ -1644,20 +1732,17 @@ def run_v2(self, **kwargs): tf_decorated.__dict__[decorator_tag] = True return tf_decorated - if func is not None: - return decorator(func) - return decorator -def py_func_if_in_function(f): +def py_func_if_in_function(f: _F) -> _F: def decorated(*args, **kwds): if not ops.inside_function(): return f(*args, **kwds) - tensor_args = [] - tensor_indices = [] + tensor_args: list[Union[tensor_lib.Tensor, variables.Variable]] = [] + tensor_indices: list[int] = [] for i, arg in enumerate(args): if isinstance(arg, (tensor_lib.Tensor, variables.Variable)): tensor_args.append(arg) @@ -1674,7 +1759,7 @@ def inner_f(*inner_tensor_args): return tf_decorator.make_decorator(f, decorated) -def also_run_as_tf_function(f): +def also_run_as_tf_function(f: Callable[..., Any]) -> Callable[..., None]: """Runs the decorated test twice--once as is, once inside a tf.function. 
This allows you to run a test both in eager execution and inside a @@ -1694,9 +1779,9 @@ def also_run_as_tf_function(f): tf.function. """ - def decorated(*args, **kwds): + def decorated(*args, **kwds) -> None: - def bound_f(): + def bound_f() -> None: f(*args, **kwds) with context.eager_mode(): @@ -1709,59 +1794,64 @@ def bound_f(): return decorated -def deprecated_graph_mode_only(func=None): +@overload +def deprecated_graph_mode_only(func: _F) -> _F: + ... + + +@overload +def deprecated_graph_mode_only(func: _TC) -> Optional[_TC]: + ... + + +def deprecated_graph_mode_only(func: Union[_TC, _F]) -> Union[_TC, _F]: """Execute the decorated test in graph mode. - This function returns a decorator intended to be applied to tests that are not - compatible with eager mode. When this decorator is applied, the test body will - be run in an environment where API calls construct graphs instead of executing - eagerly. + This is a decorator intended to be applied to tests that are not compatible + with eager mode. When this decorator is applied, the test body will be run in + an environment where API calls construct graphs instead of executing eagerly. `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and `run_in_graph_and_eager_modes` are available decorators for different v1/v2/eager/graph combinations. Args: - func: function to be annotated. If `func` is None, this method returns a - decorator the can be applied to a function. If `func` is not None this - returns the decorator applied to `func`. + func: function or class to be annotated. + If `func` is a function this returns the decorator applied to `func`. + If `func` is a unit test class this returns that class with the decorator + applied to all test functions within that class. Returns: - Returns a decorator that will run the decorated test method in graph mode. + Returns a function or class that will run the decorated test(s) + in graph mode. 
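A standalone sketch of the class-or-function dispatch used by decorators like the one above: when handed a class, the decorator re-applies itself to each test method. The `trace_calls` name is hypothetical and the sketch omits TensorFlow specifics such as setUp handling.

import inspect
import unittest
from typing import Any, Callable, TypeVar

_T = TypeVar("_T")


def trace_calls(obj: _T) -> _T:
  """Wraps a function, or every test method when given a class."""
  if inspect.isclass(obj):
    for name, value in list(vars(obj).items()):
      if callable(value) and name.startswith(unittest.TestLoader.testMethodPrefix):
        setattr(obj, name, trace_calls(value))
    return obj

  func: Callable[..., Any] = obj  # type: ignore[assignment]

  def wrapper(*args: Any, **kwargs: Any) -> Any:
    print("calling", func.__name__)
    return func(*args, **kwargs)

  return wrapper  # type: ignore[return-value]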
""" - def decorator(f): - if tf_inspect.isclass(f): - setup = f.__dict__.get("setUp") - if setup is not None: - setattr(f, "setUp", decorator(setup)) - - for name, value in f.__dict__.copy().items(): - if (callable(value) and - name.startswith(unittest.TestLoader.testMethodPrefix)): - setattr(f, name, decorator(value)) - - return f + if tf_inspect.isclass(func): + setup = func.__dict__.get("setUp") + if setup is not None: + setattr(func, "setUp", deprecated_graph_mode_only(setup)) - def decorated(self, *args, **kwargs): - if context.executing_eagerly(): - with context.graph_mode(): - return f(self, *args, **kwargs) - else: - return f(self, *args, **kwargs) + for name, value in func.__dict__.copy().items(): + if (callable(value) and + name.startswith(unittest.TestLoader.testMethodPrefix)): + setattr(func, name, deprecated_graph_mode_only(value)) - return decorated + return func - if func is not None: - return decorator(func) + def decorated(*args, **kwargs): + if context.executing_eagerly(): + with context.graph_mode(): + return func(*args, **kwargs) + else: + return func(*args, **kwargs) - return decorator + return tf_decorator.make_decorator(func, decorated) run_deprecated_v1 = deprecated_graph_mode_only -def run_all_in_deprecated_graph_mode_only(cls): +def run_all_in_deprecated_graph_mode_only(cls: _TC) -> _TC: """Execute all tests in a class in graph mode.""" base_decorator = deprecated_graph_mode_only for name in dir(cls): @@ -1847,73 +1937,57 @@ def run_v2_only(func=None, reason=None): return _run_vn_only(func=func, v2=True, reason=reason) -def run_gpu_only(func=None): +def run_gpu_only(func: _F) -> _F: """Execute the decorated test only if a GPU is available. This function is intended to be applied to tests that require the presence of a GPU. If a GPU is absent, it will simply be skipped. Args: - func: function to be annotated. If `func` is None, this method returns a - decorator the can be applied to a function. If `func` is not None this - returns the decorator applied to `func`. + func: function to be annotated. Returns: - Returns a decorator that will conditionally skip the decorated test method. + Returns a function that will conditionally skip the decorated test method. """ - def decorator(f): - if tf_inspect.isclass(f): - raise ValueError("`run_gpu_only` only supports test methods.") - - def decorated(self, *args, **kwargs): - if not is_gpu_available(): - self.skipTest("Test requires GPU") + if tf_inspect.isclass(func): + raise ValueError("`run_gpu_only` only supports test methods.") - return f(self, *args, **kwargs) + def decorated(self: "TensorFlowTestCase", *args, **kwargs): + if not is_gpu_available(): + self.skipTest("Test requires GPU") - return decorated - - if func is not None: - return decorator(func) + return func(self, *args, **kwargs) - return decorator + return decorated -def run_cuda_only(func=None): +def run_cuda_only(func: _F) -> _F: """Execute the decorated test only if a GPU is available. This function is intended to be applied to tests that require the presence of a CUDA GPU. If a CUDA GPU is absent, it will simply be skipped. Args: - func: function to be annotated. If `func` is None, this method returns a - decorator the can be applied to a function. If `func` is not None this - returns the decorator applied to `func`. + func: function to be annotated. Returns: - Returns a decorator that will conditionally skip the decorated test method. + Returns a function that will conditionally skip the decorated test method. 
""" - def decorator(f): - if tf_inspect.isclass(f): - raise ValueError("`run_cuda_only` only supports test methods.") - - def decorated(self, *args, **kwargs): - if not is_gpu_available(cuda_only=True): - self.skipTest("Test requires CUDA GPU") + if tf_inspect.isclass(func): + raise ValueError("`run_cuda_only` only supports test methods.") - return f(self, *args, **kwargs) + def decorated(self: "TensorFlowTestCase", *args, **kwargs): + if not is_gpu_available(cuda_only=True): + self.skipTest("Test requires CUDA GPU") - return decorated - - if func is not None: - return decorator(func) + return func(self, *args, **kwargs) - return decorator + return decorated -def run_gpu_or_tpu(func=None): +def run_gpu_or_tpu(func: _F) -> _F: """Execute the decorated test only if a physical GPU or TPU is available. This function is intended to be applied to tests that require the presence @@ -1923,33 +1997,30 @@ def run_gpu_or_tpu(func=None): - If both GPU and TPU are absent, the test will be skipped. Args: - func: function to be annotated. If `func` is None, this method returns a - decorator the can be applied to a function. If `func` is not None this - returns the decorator applied to `func`. + func: function to be annotated. Returns: - Returns a decorator that will conditionally skip the decorated test method. + Returns a function that will conditionally skip the decorated test method. """ - def decorator(f): - if tf_inspect.isclass(f): - raise ValueError("`run_gpu_or_tpu` only supports test methods.") + if tf_inspect.isclass(func): + raise ValueError("`run_gpu_or_tpu` only supports test methods.") - def decorated(self, *args, **kwargs): - if config.list_physical_devices("GPU"): - return f(self, "GPU", *args, **kwargs) + def decorated(self: "TensorFlowTestCase", *args, **kwargs): + if config.list_physical_devices("GPU"): + return func(self, "GPU", *args, **kwargs) - if config.list_physical_devices("TPU"): - return f(self, "TPU", *args, **kwargs) + if config.list_physical_devices("TPU"): + return func(self, "TPU", *args, **kwargs) - self.skipTest("Test requires GPU or TPU") + self.skipTest("Test requires GPU or TPU") - return decorated - - return decorator if func is None else decorator(func) + return decorated -def with_forward_compatibility_horizons(*horizons): +def with_forward_compatibility_horizons( + *horizons: Optional[tuple[int, int, int]] +) -> Callable[[Callable[..., Any]], Callable[..., None]]: """Executes the decorated test with the specified forward-compat horizons. 
Args: @@ -1967,19 +2038,19 @@ def with_forward_compatibility_horizons(*horizons): (len(horizon) == 3 and all(isinstance(x, int) for x in horizon))): raise ValueError("Bad horizon value: %r" % horizon) - def decorator(f): + def decorator(f: Callable[..., Any]) -> Callable[..., None]: if tf_inspect.isclass(f): raise ValueError("`with_forward_compatibility_horizons` only " "supports test methods.") - def decorated(self, *args, **kwargs): + def decorated(*args, **kwargs): for horizon in horizons: if horizon is None: - f(self, *args, **kwargs) + f(*args, **kwargs) else: (year, month, day) = horizon with forward_compatibility_horizon(year, month, day): - f(self, *args, **kwargs) - return decorated + f(*args, **kwargs) + return tf_decorator.make_decorator(f, decorated) return decorator @@ -1987,7 +2058,10 @@ def decorated(self, *args, **kwargs): @deprecation.deprecated(None, "Use `tf.config.list_physical_devices('GPU')` instead.") @tf_export("test.is_gpu_available") -def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None): +def is_gpu_available( + cuda_only: bool = False, + min_cuda_compute_capability: Optional[tuple[int, int]] = None, +) -> bool: """Returns whether TensorFlow can access a GPU. Warning: if a non-GPU version of the package is installed, the function would @@ -2043,7 +2117,7 @@ def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None): @contextlib.contextmanager -def device(use_gpu): +def device(use_gpu: bool) -> Iterator[None]: """Uses gpu when requested and available.""" if use_gpu and is_gpu_available(): dev = "/device:GPU:0" @@ -2054,28 +2128,28 @@ def device(use_gpu): @contextlib.contextmanager -def use_gpu(): +def use_gpu() -> Iterator[None]: """Uses gpu when requested and available.""" with device(use_gpu=True): yield @contextlib.contextmanager -def force_gpu(): +def force_gpu() -> Iterator[None]: """Force the gpu to be used.""" with ops.device("/device:GPU:0"): yield @contextlib.contextmanager -def force_cpu(): +def force_cpu() -> Iterator[None]: """Force the cpu to be used.""" with ops.device("/device:CPU:0"): yield @contextlib.contextmanager -def deterministic_ops(): +def deterministic_ops() -> Iterator[None]: """Enables deterministic ops.""" try: config.enable_op_determinism() @@ -2087,10 +2161,10 @@ def deterministic_ops(): class CapturedWrites: """A utility class to load the captured writes made to a stream.""" - def __init__(self, capture_location): + def __init__(self, capture_location: str): self.capture_location = capture_location - def contents(self): + def contents(self) -> str: """Get the captured writes as a single string.""" with open(self.capture_location) as tmp_file: output_data = "".join(tmp_file.readlines()) @@ -2169,7 +2243,7 @@ def run(self, *args, **kwargs): raise -def disable_cudnn_autotune(func): +def disable_cudnn_autotune(func: _F) -> _F: """Disable autotuning during the call to this function. Some tests want to base assertions on a graph being isomorphic with a copy. @@ -2182,46 +2256,39 @@ def disable_cudnn_autotune(func): Decorated function. 
""" - def decorator(f): + def decorated(*args, **kwargs): + original_tf_cudnn_use_autotune = os.environ.get("TF_CUDNN_USE_AUTOTUNE") + os.environ["TF_CUDNN_USE_AUTOTUNE"] = "false" + original_xla_flags = os.environ.get("XLA_FLAGS") + new_xla_flags = "--xla_gpu_autotune_level=0" + if original_xla_flags: + new_xla_flags = original_xla_flags + " " + new_xla_flags + os.environ["XLA_FLAGS"] = new_xla_flags - def decorated(self, *args, **kwargs): - original_tf_cudnn_use_autotune = os.environ.get("TF_CUDNN_USE_AUTOTUNE") - os.environ["TF_CUDNN_USE_AUTOTUNE"] = "false" - original_xla_flags = os.environ.get("XLA_FLAGS") - new_xla_flags = "--xla_gpu_autotune_level=0" - if original_xla_flags: - new_xla_flags = original_xla_flags + " " + new_xla_flags - os.environ["XLA_FLAGS"] = new_xla_flags + result = func(*args, **kwargs) - result = f(self, *args, **kwargs) - - if (original_tf_cudnn_use_autotune is None): - del os.environ["TF_CUDNN_USE_AUTOTUNE"] - else: - os.environ["TF_CUDNN_USE_AUTOTUNE"] = original_tf_cudnn_use_autotune - if (original_xla_flags is None): - del os.environ["XLA_FLAGS"] - else: - os.environ["XLA_FLAGS"] = original_xla_flags - - return result - - return tf_decorator.make_decorator(func, decorated) + if (original_tf_cudnn_use_autotune is None): + del os.environ["TF_CUDNN_USE_AUTOTUNE"] + else: + os.environ["TF_CUDNN_USE_AUTOTUNE"] = original_tf_cudnn_use_autotune + if (original_xla_flags is None): + del os.environ["XLA_FLAGS"] + else: + os.environ["XLA_FLAGS"] = original_xla_flags - if func is not None: - return decorator(func) + return result - return decorator + return tf_decorator.make_decorator(func, decorated) # The description is just for documentation purposes. -def enable_tf_xla_constant_folding(description): +def enable_tf_xla_constant_folding(description: str) -> Callable[[_F], _F]: if not isinstance(description, str): raise ValueError("'description' should be string, got {}".format( type(description))) - def enable_tf_xla_constant_folding_impl(func): + def enable_tf_xla_constant_folding_impl(func: _F) -> _F: """Enable constant folding during the call to this function. Some tests fail without constant folding. @@ -2233,119 +2300,103 @@ def enable_tf_xla_constant_folding_impl(func): Decorated function. """ - def decorator(f): - - def decorated(self, *args, **kwargs): - original_var = pywrap_tf_session.TF_GetXlaConstantFoldingDisabled() - pywrap_tf_session.TF_SetXlaConstantFoldingDisabled(False) - result = f(self, *args, **kwargs) - pywrap_tf_session.TF_SetXlaConstantFoldingDisabled(original_var) - return result - - return decorated - - if func is not None: - return decorator(func) + def decorated(*args, **kwargs): + original_var = pywrap_tf_session.TF_GetXlaConstantFoldingDisabled() + pywrap_tf_session.TF_SetXlaConstantFoldingDisabled(False) + result = func(*args, **kwargs) + pywrap_tf_session.TF_SetXlaConstantFoldingDisabled(original_var) + return result - return decorator + return tf_decorator.make_decorator(func, decorated) return enable_tf_xla_constant_folding_impl # Updates test function by selectively disabling it. 
-def _disable_test(execute_func): - - def disable_test_impl(func): +def _disable_test(execute_func: bool) -> Callable[[_F], _F]: - def decorator(func): + def disable_test_impl(func: _F) -> _F: - def decorated(self, *args, **kwargs): - if execute_func: - return func(self, *args, **kwargs) - - return tf_decorator.make_decorator(func, decorated) - - if func is not None: - return decorator(func) + def decorated(*args, **kwargs): + if execute_func: + return func(*args, **kwargs) - return decorator + return tf_decorator.make_decorator(func, decorated) return disable_test_impl # The description is just for documentation purposes. -def disable_xla(description): # pylint: disable=unused-argument +def disable_xla(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """Execute the test method only if xla is not enabled.""" execute_func = not is_xla_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. -def disable_mlir_bridge(description): # pylint: disable=unused-argument +def disable_mlir_bridge(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """Execute the test method only if MLIR bridge is not enabled.""" execute_func = not is_mlir_bridge_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. -def disable_asan(description): # pylint: disable=unused-argument +def disable_asan(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """Execute the test method only if ASAN is not enabled.""" execute_func = not is_asan_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. -def disable_msan(description): # pylint: disable=unused-argument +def disable_msan(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """Execute the test method only if MSAN is not enabled.""" execute_func = not is_msan_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. -def disable_tsan(description): # pylint: disable=unused-argument +def disable_tsan(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """Execute the test method only if TSAN is not enabled.""" execute_func = not is_tsan_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. -def disable_ubsan(description): # pylint: disable=unused-argument +def disable_ubsan(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """Execute the test method only if UBSAN is not enabled.""" execute_func = not is_ubsan_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. 
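A compact sketch of the conditional-disable pattern behind the `disable_*` helpers above: a factory captures a boolean at decoration time and either runs or silently skips the wrapped test. `is_feature_enabled` is a placeholder predicate, not a TensorFlow query.

import functools
from typing import Any, Callable, TypeVar

_F = TypeVar("_F", bound=Callable[..., Any])


def is_feature_enabled() -> bool:
  return False  # Placeholder for a real build/runtime query.


def run_unless_feature_enabled(description: str) -> Callable[[_F], _F]:
  """Runs the test only when the feature is off; `description` is documentation only."""
  execute_func = not is_feature_enabled()

  def disable_test_impl(func: _F) -> _F:
    @functools.wraps(func)
    def decorated(*args: Any, **kwargs: Any) -> Any:
      if execute_func:
        return func(*args, **kwargs)
      return None  # The test body is skipped entirely.
    return decorated  # type: ignore[return-value]

  return disable_test_impl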
-def disable_tfrt(unused_description): +def disable_tfrt( + unused_description: str, # pylint: disable=unused-argument +) -> Callable[[Union[_TC, _F]], Union[_TC, _F, None]]: - def disable_tfrt_impl(cls_or_func): + def disable_tfrt_impl(cls_or_func: Union[_TC, _F]) -> Union[_TC, _F, None]: """Execute the test only if tfrt is not enabled.""" if tf_inspect.isclass(cls_or_func): if tfrt_utils.enabled(): return None else: - return cls_or_func + return cast(_TC, cls_or_func) else: - def decorator(func): - - def decorated(self, *args, **kwargs): - if tfrt_utils.enabled(): - return - else: - return func(self, *args, **kwargs) - - return decorated - - if cls_or_func is not None: - return decorator(cls_or_func) + func = cast(Callable[..., Any], cls_or_func) + def decorated(*args, **kwargs): + if tfrt_utils.enabled(): + return + else: + return func(*args, **kwargs) - return decorator + return tf_decorator.make_decorator(cls_or_func, decorated) return disable_tfrt_impl -def for_all_test_methods(decorator, *args, **kwargs): +def for_all_test_methods( + decorator: Callable[..., Any], *args, **kwargs, +) -> Callable[[_TC], _TC]: """Generate class-level decorator from given method-level decorator. It is expected for the given decorator to take some arguments and return @@ -2360,7 +2411,7 @@ def for_all_test_methods(decorator, *args, **kwargs): decorator. """ - def all_test_methods_impl(cls): + def all_test_methods_impl(cls: _TC) -> _TC: """Apply decorator to all test methods in class.""" for name in dir(cls): value = getattr(cls, name) @@ -2373,44 +2424,39 @@ def all_test_methods_impl(cls): # The description is just for documentation purposes. -def no_xla_auto_jit(description): # pylint: disable=unused-argument +def no_xla_auto_jit(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument """This test is not intended to be run with XLA auto jit enabled.""" execute_func = not is_xla_enabled() return _disable_test(execute_func) # The description is just for documentation purposes. -def xla_allow_fallback(description): # pylint: disable=unused-argument +def xla_allow_fallback(description: str) -> Callable[[_F], _F]: # pylint: disable=unused-argument - def xla_allow_fallback_impl(func): + def xla_allow_fallback_impl(func: _F) -> _F: """Allow fallback to TF even though testing xla.""" - def decorator(func): - - def decorated(self, *args, **kwargs): - if is_xla_enabled(): - # Update the global XLABuildOpsPassFlags to enable lazy compilation, - # which allows the compiler to fall back to TF classic. Remember the - # old value so that we can reset it. - old_value = pywrap_tf_session.TF_SetXlaEnableLazyCompilation(True) - result = func(self, *args, **kwargs) - pywrap_tf_session.TF_SetXlaEnableLazyCompilation(old_value) - return result - else: - return func(self, *args, **kwargs) - - return decorated - - if func is not None: - return decorator(func) + def decorated(*args, **kwargs): + if is_xla_enabled(): + # Update the global XLABuildOpsPassFlags to enable lazy compilation, + # which allows the compiler to fall back to TF classic. Remember the + # old value so that we can reset it. + old_value = pywrap_tf_session.TF_SetXlaEnableLazyCompilation(True) + result = func(*args, **kwargs) + pywrap_tf_session.TF_SetXlaEnableLazyCompilation(old_value) + return result + else: + return func(*args, **kwargs) - return decorator + return tf_decorator.make_decorator(func, decorated) return xla_allow_fallback_impl # The description is just for documentation purposes. 
-def run_without_tensor_float_32(description): # pylint: disable=unused-argument +def run_without_tensor_float_32( + description: str, # pylint: disable=unused-argument +) -> Callable[[Callable[..., Any]], Callable[..., None]]: """Execute test with TensorFloat-32 disabled. While almost every real-world deep learning model runs fine with @@ -2426,24 +2472,24 @@ def run_without_tensor_float_32(description): # pylint: disable=unused-argument Decorator which runs a test with TensorFloat-32 disabled. """ - def decorator(f): + def decorator(f: Callable[..., Any]) -> Callable[..., None]: @functools.wraps(f) - def decorated(self, *args, **kwargs): + def decorated(*args, **kwargs): allowed = config.tensor_float_32_execution_enabled() try: config.enable_tensor_float_32_execution(False) - f(self, *args, **kwargs) + f(*args, **kwargs) finally: config.enable_tensor_float_32_execution(allowed) - return decorated + return tf_decorator.make_decorator(f, decorated) return decorator # The description is just for documentation purposes. -def run_all_without_tensor_float_32(description): # pylint: disable=unused-argument +def run_all_without_tensor_float_32(description: str) -> Callable[[_TC], _TC]: # pylint: disable=unused-argument """Execute all tests in a class with TensorFloat-32 disabled.""" return for_all_test_methods(run_without_tensor_float_32, description) @@ -2585,7 +2631,7 @@ def _ClearCachedSession(self): self._cached_session.close() self._cached_session = None - def get_temp_dir(self): + def get_temp_dir(self) -> str: """Returns a unique temporary directory for the test to use. If you call this method multiple times during in a test, it will return the @@ -2814,7 +2860,11 @@ def evaluate( # pylint: disable=redefined-outer-name @contextlib.contextmanager def session( - self, graph=None, config=None, use_gpu=True, force_gpu=False + self, + graph: Optional[ops.Graph] = None, + config: Optional[config_pb2.ConfigProto] = None, + use_gpu: bool = True, + force_gpu: bool = False, ) -> Iterator[s.Session]: """A context manager for a TensorFlow Session for use in executing tests. @@ -2859,11 +2909,13 @@ def testMyOperator(self): yield sess @contextlib.contextmanager - def cached_session(self, - graph=None, - config=None, - use_gpu=True, - force_gpu=False) -> Iterator[s.Session]: + def cached_session( + self, + graph: Optional[ops.Graph] = None, + config: Optional[config_pb2.ConfigProto] = None, + use_gpu: bool = True, + force_gpu: bool = False, + ) -> Iterator[s.Session]: """Returns a TensorFlow Session for use in executing tests. This method behaves differently than self.session(): for performance reasons @@ -2913,11 +2965,13 @@ def testMyOperator(self): @contextlib.contextmanager @deprecation.deprecated(None, "Use `self.session()` or " "`self.cached_session()` instead.") - def test_session(self, - graph=None, - config=None, - use_gpu=True, - force_gpu=False): + def test_session( + self, + graph: Optional[ops.Graph] = None, + config: Optional[config_pb2.ConfigProto] = None, + use_gpu: bool = True, + force_gpu: bool = False, + ) -> Iterator[s.Session]: """Use cached_session instead.""" if self.id().endswith(".test_session"): self.skipTest( @@ -2947,7 +3001,13 @@ class _CheckedThread(object): method. """ - def __init__(self, testcase, target, args=None, kwargs=None): + def __init__( + self, + testcase: "TensorFlowTestCase", + target: Callable[..., Any], + args: Optional[tuple[Any, ...]] = None, + kwargs: Optional[dict[str, Any]] = None, + ): """Constructs a new instance of _CheckedThread. 
Args: @@ -2959,21 +3019,21 @@ def __init__(self, testcase, target, args=None, kwargs=None): """ self._testcase = testcase self._target = target - self._args = () if args is None else args - self._kwargs = {} if kwargs is None else kwargs + self._args: tuple[Any, ...] = () if args is None else args + self._kwargs: dict[str, Any] = {} if kwargs is None else kwargs self._thread = threading.Thread(target=self._protected_run) self._exception = None self._is_thread_joined = False - def _protected_run(self): + def _protected_run(self) -> None: """Target for the wrapper thread. Sets self._exception on failure.""" try: self._target(*self._args, **self._kwargs) except Exception as e: # pylint: disable=broad-except self._exception = e - def start(self): + def start(self) -> None: """Starts the thread's activity. This must be called at most once per _CheckedThread object. It arranges @@ -2981,7 +3041,7 @@ def start(self): """ self._thread.start() - def join(self): + def join(self) -> None: """Blocks until the thread terminates. Raises: @@ -2993,7 +3053,7 @@ def join(self): if self._exception is not None: self._testcase.fail("Error in checkedThread: %s" % str(self._exception)) - def is_alive(self): + def is_alive(self) -> bool: """Returns whether the thread is alive. This method returns True just before the run() method starts @@ -3004,7 +3064,7 @@ def is_alive(self): """ return self._thread.is_alive() - def check_termination(self): + def check_termination(self) -> None: """Returns whether the checked thread was properly used and did terminate. Every checked thread should be "join"ed after starting, and before the @@ -3026,7 +3086,12 @@ def check_termination(self): else: self._testcase.fail("A checked thread was not joined.") - def checkedThread(self, target, args=None, kwargs=None): + def checkedThread( + self, + target: Callable[..., Any], + args: Optional[tuple[Any, ...]] = None, + kwargs: Optional[dict[str, Any]] = None, + ) -> _CheckedThread: """Returns a Thread wrapper that asserts 'target' completes successfully. 
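The _CheckedThread wrapper annotated above exists so that an exception raised on a worker thread fails the test instead of being lost. A small self-contained sketch of the same idea, independent of the TensorFlow test framework:

import threading
from typing import Any, Callable, Optional


class CheckedThread:
  """Runs `target` on a thread and re-raises its exception on join()."""

  def __init__(self, target: Callable[..., Any], *args: Any, **kwargs: Any):
    self._target = target
    self._args = args
    self._kwargs = kwargs
    self._exception: Optional[BaseException] = None
    self._thread = threading.Thread(target=self._protected_run)

  def _protected_run(self) -> None:
    try:
      self._target(*self._args, **self._kwargs)
    except BaseException as e:  # pylint: disable=broad-except
      self._exception = e

  def start(self) -> None:
    self._thread.start()

  def join(self) -> None:
    self._thread.join()
    if self._exception is not None:
      raise self._exception


t = CheckedThread(lambda x: x / 0, 1)
t.start()
try:
  t.join()
except ZeroDivisionError:
  print("worker failure surfaced on join")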
This method should be used to create all threads in test cases, as @@ -3648,8 +3713,13 @@ def assertRaisesWithPredicateMatch(self, exception_type, else: def predicate(e): - err_str = e.message if isinstance(e, errors.OpError) else str(e) - op = e.op if isinstance(e, errors.OpError) else None + if isinstance(e, errors.OpError): + e = cast(errors.OpError, e) + err_str = cast(str, e.message) + op = e.op + else: + err_str = str(e) + op = None while op is not None: err_str += "\nCaused by: " + op.name op = op._original_op # pylint: disable=protected-access @@ -3748,7 +3818,8 @@ def assertDictEqual(self, a, b, msg=None): def _GetPyList(self, a): """Converts `a` to a nested python list.""" if isinstance(a, ragged_tensor.RaggedTensor): - return self.evaluate(a).to_list() + a = cast(ragged_tensor_value.RaggedTensorValue, self.evaluate(a)) + return a.to_list() elif isinstance(a, tensor_lib.Tensor): a = self.evaluate(a) return a.tolist() if isinstance(a, np.ndarray) else a @@ -3802,7 +3873,9 @@ def _assertListCloseRecursive(self, a, b, rtol, atol, msg, path="value"): # pylint: enable=invalid-name @contextlib.contextmanager - def _constrain_devices_and_set_default(self, sess, use_gpu, force_gpu): + def _constrain_devices_and_set_default( + self, sess: s.Session, use_gpu: bool, force_gpu: bool, + ) -> Iterator[s.Session]: """Set the session and its graph to global default and constrain devices.""" if context.executing_eagerly(): yield None @@ -3822,10 +3895,17 @@ def _constrain_devices_and_set_default(self, sess, use_gpu, force_gpu): with sess.graph.device("/device:CPU:0"): yield sess - def _create_session(self, graph, config, force_gpu): + def _create_session( + self, + graph: Optional[ops.Graph], + config: Optional[config_pb2.ConfigProto], + force_gpu: bool, + ) -> s.Session: """See session() for details.""" - def prepare_config(config): + def prepare_config( + config: Optional[config_pb2.ConfigProto], + ) -> config_pb2.ConfigProto: """Returns a config for sessions. Args: @@ -3861,11 +3941,13 @@ def prepare_config(config): return ErrorLoggingSession(graph=graph, config=prepare_config(config)) - def _get_cached_session(self, - graph=None, - config=None, - force_gpu=False, - crash_if_inconsistent_args=True): + def _get_cached_session( + self, + graph: Optional[ops.Graph] = None, + config: Optional[config_pb2.ConfigProto] = None, + force_gpu: bool = False, + crash_if_inconsistent_args: bool = True, + ) -> s.Session: """See cached_session() for documentation.""" if self._cached_session is None: sess = self._create_session( @@ -3896,7 +3978,7 @@ def _get_cached_session(self, return self._cached_session -ASSIGNED_PORTS = set() +ASSIGNED_PORTS: set[int] = set() lock = threading.Lock() @@ -3919,11 +4001,13 @@ def pick_unused_port(): @tf_export("test.create_local_cluster") -def create_local_cluster(num_workers, - num_ps, - protocol="grpc", - worker_config=None, - ps_config=None): +def create_local_cluster( + num_workers: int, + num_ps: int, + protocol: str = "grpc", + worker_config: Optional[config_pb2.ConfigProto] = None, + ps_config: Optional[config_pb2.ConfigProto] = None, +) -> tuple[list[server_lib.Server], list[server_lib.Server]]: """Create and start local servers and return the associated `Server` objects. 
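The session helpers above are generator functions wrapped in contextlib.contextmanager, which is why their return annotations are Iterator[...] rather than the yielded type itself. A generic sketch of that annotation pattern, using a made-up `scoped_value` helper:

import contextlib
from collections.abc import Iterator


@contextlib.contextmanager
def scoped_value(value: int) -> Iterator[int]:
  """Yields `value`; the generator itself is typed as Iterator[int]."""
  print("entering scope")
  try:
    yield value
  finally:
    print("leaving scope")


with scoped_value(42) as v:
  print(v)  # Type checkers see `v` as int.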
"PS" stands for "parameter server": a task responsible for storing and @@ -4006,7 +4090,9 @@ def create_local_cluster(num_workers, return workers, ps_servers -def get_node_def_from_graph(node_name, graph_def): +def get_node_def_from_graph( + node_name: str, graph_def: graph_pb2.GraphDef, +) -> Optional[node_def_pb2.NodeDef]: """Returns the `NodeDef` instance for given node name in the graph def. This method explores only the NodeDefs in `graph_def.node`. @@ -4024,7 +4110,7 @@ def get_node_def_from_graph(node_name, graph_def): return None -def set_producer_version(graph, producer_version): +def set_producer_version(graph: ops.Graph, producer_version: int) -> None: """Sets graph.graph_def_versions.producer to `producer_version`.""" # The C API doesn't expose altering GraphDefVersions. We can indirectly set # it via import_graph_def though. @@ -4089,7 +4175,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): @contextlib.contextmanager -def run_functions_eagerly(run_eagerly): +def run_functions_eagerly(run_eagerly: bool) -> Iterator[None]: """Runs functions eagerly if `run_eagerly` is true. WARNING: Setting `run_eagerly` to True in tests running in V1 graph mode @@ -4134,17 +4220,17 @@ def __init__(self, name, label): self.label = label self.Reset() - def Reset(self): + def Reset(self) -> None: self.last_value = _test_metrics_util.test_counter_value( self.name, self.label) - def Get(self): + def Get(self) -> int: value = _test_metrics_util.test_counter_value(self.name, self.label) return value - self.last_value @tf_export("test.experimental.sync_devices") -def sync_devices(): +def sync_devices() -> None: """Synchronizes all devices. By default, GPUs run asynchronously. This means that when you run an op on the diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index 69680857a3b037..1407aa328b7056 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -1196,11 +1196,11 @@ def __init__(self, *args, **kwargs): self.accumulation = [] @unittest.expectedFailure - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def test_has_leak(self): self.accumulation.append([1.]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def test_has_no_leak(self): self.not_accumulating = [1.] diff --git a/tensorflow/python/framework/traceable_stack.py b/tensorflow/python/framework/traceable_stack.py index bce16048a24983..8a1fde77e6d506 100644 --- a/tensorflow/python/framework/traceable_stack.py +++ b/tensorflow/python/framework/traceable_stack.py @@ -14,21 +14,32 @@ # ============================================================================== """A simple stack that associates filename and line numbers with each object.""" +from collections.abc import Iterator import inspect +import types +from typing import cast, Generic, Optional, TypeVar -class TraceableObject(object): +T = TypeVar("T") + + +class TraceableObject(Generic[T]): """Wrap an object together with its the code definition location.""" # Return codes for the set_filename_and_line_from_caller() method. 
SUCCESS, HEURISTIC_USED, FAILURE = (0, 1, 2) - def __init__(self, obj, filename=None, lineno=None): + def __init__( + self, + obj: T, + filename: Optional[str] = None, + lineno: Optional[int] = None, + ): self.obj = obj self.filename = filename self.lineno = lineno - def set_filename_and_line_from_caller(self, offset=0): + def set_filename_and_line_from_caller(self, offset: int = 0) -> int: """Set filename and line using the caller's stack frame. If the requested stack information is not available, a heuristic may @@ -49,6 +60,9 @@ def set_filename_and_line_from_caller(self, offset=0): """ retcode = self.SUCCESS frame = inspect.currentframe() + if not frame: + return self.FAILURE + frame = cast(types.FrameType, frame) # Offset is defined in "Args" as relative to the caller. We are one frame # beyond the caller. for _ in range(offset + 1): @@ -57,9 +71,10 @@ def set_filename_and_line_from_caller(self, offset=0): # If the offset is too large then we use the largest offset possible. retcode = self.HEURISTIC_USED break + parent = cast(types.FrameType, parent) frame = parent self.filename = frame.f_code.co_filename - self.lineno = frame.f_lineno + self.lineno = cast(int, frame.f_lineno) return retcode def copy_metadata(self): @@ -67,19 +82,22 @@ def copy_metadata(self): return self.__class__(None, filename=self.filename, lineno=self.lineno) -class TraceableStack(object): +class TraceableStack(Generic[T]): """A stack of TraceableObjects.""" - def __init__(self, existing_stack=None): + def __init__( + self, existing_stack: Optional[list[TraceableObject[T]]] = None, + ): """Constructor. Args: existing_stack: [TraceableObject, ...] If provided, this object will set its new stack to a SHALLOW COPY of existing_stack. """ - self._stack = existing_stack[:] if existing_stack else [] + self._stack: list[TraceableObject[T]] = (existing_stack[:] if existing_stack + else []) - def push_obj(self, obj, offset=0): + def push_obj(self, obj: T, offset: int = 0): """Add object to the stack and record its filename and line information. Args: @@ -98,27 +116,27 @@ def push_obj(self, obj, offset=0): # beyond the caller and need to compensate. return traceable_obj.set_filename_and_line_from_caller(offset + 1) - def pop_obj(self): + def pop_obj(self) -> T: """Remove last-inserted object and return it, without filename/line info.""" return self._stack.pop().obj - def peek_top_obj(self): + def peek_top_obj(self) -> T: """Return the most recent stored object.""" return self._stack[-1].obj - def peek_objs(self): + def peek_objs(self) -> Iterator[T]: """Return iterator over stored objects ordered newest to oldest.""" return (t_obj.obj for t_obj in reversed(self._stack)) - def peek_traceable_objs(self): + def peek_traceable_objs(self) -> Iterator[TraceableObject[T]]: """Return iterator over stored TraceableObjects ordered newest to oldest.""" return reversed(self._stack) - def __len__(self): + def __len__(self) -> int: """Return number of items on the stack, and used for truth-value testing.""" return len(self._stack) - def copy(self): + def copy(self) -> "TraceableStack[T]": """Return a copy of self referencing the same objects but in a new list. This method is implemented to support thread-local stacks. 
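The Generic[T] parameterisation added to TraceableObject and TraceableStack above lets the element type flow through push/pop for type checkers. A minimal standalone generic stack showing the same typing idea (all names here are illustrative; assumes Python 3.9+):

from typing import Generic, Optional, TypeVar

T = TypeVar("T")


class Entry(Generic[T]):
  """Pairs a value with the source location it was recorded from."""

  def __init__(self, obj: T, filename: Optional[str] = None,
               lineno: Optional[int] = None):
    self.obj = obj
    self.filename = filename
    self.lineno = lineno


class Stack(Generic[T]):

  def __init__(self) -> None:
    self._stack: list[Entry[T]] = []

  def push(self, obj: T) -> None:
    self._stack.append(Entry(obj))

  def pop(self) -> T:
    return self._stack.pop().obj


s: "Stack[str]" = Stack()
s.push("/device:CPU:0")
print(s.pop())  # Type checkers infer `str` here.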
diff --git a/tensorflow/python/framework/type_spec.py b/tensorflow/python/framework/type_spec.py index 26911cdd97bc80..beb278b1624f2c 100644 --- a/tensorflow/python/framework/type_spec.py +++ b/tensorflow/python/framework/type_spec.py @@ -33,7 +33,6 @@ from tensorflow.python.types import core as core_types from tensorflow.python.types import internal from tensorflow.python.types import trace -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import compat from tensorflow.python.util import deprecation from tensorflow.python.util import nest @@ -1057,6 +1056,3 @@ def register_type_spec_from_value_converter(type_object, _, type_object = tf_decorator.unwrap(type_object) _TYPE_CONVERSION_FUNCTION_REGISTRY.append( (type_object, converter_fn, allow_subclass)) - - -_pywrap_utils.RegisterType("TypeSpec", TypeSpec) diff --git a/tensorflow/python/grappler/BUILD b/tensorflow/python/grappler/BUILD index 0a19f8fbcf89c3..366ebfa1927674 100644 --- a/tensorflow/python/grappler/BUILD +++ b/tensorflow/python/grappler/BUILD @@ -227,7 +227,7 @@ cuda_py_strict_test( size = "small", srcs = ["cluster_test.py"], python_version = "PY3", - shard_count = 10, + shard_count = 5, tags = [ "grappler", "no_pip", # tf_optimizer is not available in pip. diff --git a/tensorflow/python/grappler/remapper_test.py b/tensorflow/python/grappler/remapper_test.py index 6d693431f60ea4..91f283c5969792 100644 --- a/tensorflow/python/grappler/remapper_test.py +++ b/tensorflow/python/grappler/remapper_test.py @@ -227,6 +227,8 @@ def test_conv2d_biasadd_act_fusion(self): """Test Conv2D+BiasAdd+Relu fusion.""" if not test_util.is_gpu_available(): self.skipTest('No GPU available') + if test.is_built_with_rocm(): + self.skipTest('ROCm does not support conv biasadd fusion') N, H, W, C = (5, 3, 3, 8) # pylint: disable=invalid-name # The runtime fusion requires the output dims to be 32-bit aligned. 
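The remapper_test change just above adds an early skip when the build cannot exercise the fusion under test. A small unittest-only sketch of that guard style; `gpu_available` and `built_with_rocm` are stand-in predicates rather than the TensorFlow ones:

import unittest


def gpu_available() -> bool:
  return False  # Stand-in for a real device query.


def built_with_rocm() -> bool:
  return False  # Stand-in for a real build-configuration query.


class FusionTest(unittest.TestCase):

  def test_conv_biasadd_fusion(self):
    if not gpu_available():
      self.skipTest("No GPU available")
    if built_with_rocm():
      self.skipTest("ROCm does not support conv biasadd fusion")
    self.assertEqual(2 + 2, 4)  # Placeholder for the real fused-op check.


if __name__ == "__main__":
  unittest.main()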
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index c9be0a3cc5ba10..fe1a5022c5296f 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -52,6 +52,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import tensor_getitem_override from tensorflow.python.ops import variable_scope from tensorflow.python.ops.ragged import ragged_getitem from tensorflow.python.ops.ragged import ragged_tensor @@ -1559,7 +1560,7 @@ def handle(self, args, kwargs): return self.NOT_SUPPORTED for slicing_op in [ - array_ops._slice_helper, # pylint: disable=protected-access + tensor_getitem_override._slice_helper, # pylint: disable=protected-access array_ops.boolean_mask, array_ops.boolean_mask_v2, ragged_getitem.ragged_tensor_getitem diff --git a/tensorflow/python/kernel_tests/array_ops/BUILD b/tensorflow/python/kernel_tests/array_ops/BUILD index 4852a3c1768527..80cb2b53072a28 100644 --- a/tensorflow/python/kernel_tests/array_ops/BUILD +++ b/tensorflow/python/kernel_tests/array_ops/BUILD @@ -469,9 +469,11 @@ cuda_py_strict_test( cuda_py_strict_test( name = "manip_ops_test", - size = "small", + size = "medium", srcs = ["manip_ops_test.py"], - tags = ["no_windows_gpu"], + tags = [ + "no_windows_gpu", + ], deps = [ "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:errors", diff --git a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py index 0a3e6a2eb29b0c..f2cd5d0fd2afef 100644 --- a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py @@ -47,6 +47,7 @@ from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import state_ops +from tensorflow.python.ops import tensor_getitem_override from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variable_v1 from tensorflow.python.ops import variables @@ -688,7 +689,7 @@ def testInt64GPU(self): s = array_ops.strided_slice(x, begin, end, strides) self.assertAllEqual([3.], self.evaluate(s)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() @test_util.assert_no_garbage_created def testTensorSliceEagerMemory(self): with context.eager_mode(): @@ -697,7 +698,7 @@ def testTensorSliceEagerMemory(self): # Tests that slicing an EagerTensor doesn't leak memory inputs[0] # pylint: disable=pointless-statement - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() @test_util.assert_no_garbage_created def testVariableSliceEagerMemory(self): if sys.version_info.major == 3 and sys.version_info.minor in (11, 12): @@ -788,7 +789,7 @@ def testTensorIndexing(self): def testTensorIndexingTypeError(self): with self.session(): checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR) - expected = re.escape(array_ops._SLICE_TYPE_ERROR) + expected = re.escape(tensor_getitem_override._SLICE_TYPE_ERROR) with self.assertRaisesRegex(TypeError, expected): _ = checker["foo"] with self.assertRaisesRegex(TypeError, expected): diff --git a/tensorflow/python/kernel_tests/array_ops/constant_op_test.py b/tensorflow/python/kernel_tests/array_ops/constant_op_test.py index 5fb4fb659d8f19..55cb3e049c0b32 100644 --- 
a/tensorflow/python/kernel_tests/array_ops/constant_op_test.py +++ b/tensorflow/python/kernel_tests/array_ops/constant_op_test.py @@ -208,7 +208,7 @@ def testExplicitShapeNumPy(self): shape=[2, 3, 5]) self.assertEqual(c.get_shape(), [2, 3, 5]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testEagerMemory(self): """Tests PyObject refs are managed correctly when executing eagerly.""" constant_op.constant([[1.]]) diff --git a/tensorflow/python/kernel_tests/array_ops/manip_ops_test.py b/tensorflow/python/kernel_tests/array_ops/manip_ops_test.py index 65291165da8827..35e2c3c0f86e36 100644 --- a/tensorflow/python/kernel_tests/array_ops/manip_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops/manip_ops_test.py @@ -105,11 +105,25 @@ def testEmptyInput(self): self._testAll(np.zeros([0, 1]), 1, 1) self._testAll(np.zeros([1, 0]), 1, 1) + @test_util.run_v2_only + def testLargeInput(self): + with test_util.force_cpu(): + # Num elements just over INT_MAX for int32 to ensure no overflow + np_input = np.arange(0, 128 * 524289 * 33, dtype=np.int8).reshape( + 128, -1, 33 + ) + + for shift in range(-5, 5): + roll = manip_ops.roll(np_input, shift, 0) + self.assertAllEqual(roll[shift], np_input[0], msg=f"shift={shift}") + self.assertAllEqual(roll[0], np_input[-shift], msg=f"shift={shift}") + @test_util.run_deprecated_v1 def testInvalidInputShape(self): # The input should be 1-D or higher, checked in shape function. - with self.assertRaisesRegex(ValueError, - "Shape must be at least rank 1 but is rank 0"): + with self.assertRaisesRegex( + ValueError, "Shape must be at least rank 1 but is rank 0" + ): manip_ops.roll(7, 1, 0) @test_util.run_deprecated_v1 diff --git a/tensorflow/python/kernel_tests/image_ops/BUILD b/tensorflow/python/kernel_tests/image_ops/BUILD index 93bef64e928122..c63a8bccd5d5e9 100644 --- a/tensorflow/python/kernel_tests/image_ops/BUILD +++ b/tensorflow/python/kernel_tests/image_ops/BUILD @@ -1,6 +1,10 @@ # Tests of TensorFlow image kernels written using the Python API. load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test") +load( + "//tensorflow/tools/test:performance.bzl", + "tf_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -64,7 +68,7 @@ tf_py_strict_test( ], ) -tf_py_strict_test( +tf_py_benchmark_test( name = "decode_jpeg_op_test", srcs = ["decode_jpeg_op_test.py"], data = ["//tensorflow/core:image_testdata"], diff --git a/tensorflow/python/kernel_tests/image_ops/draw_bounding_box_op_test.py b/tensorflow/python/kernel_tests/image_ops/draw_bounding_box_op_test.py index a66d8d8a9a2a13..f7641c63e7f7e3 100644 --- a/tensorflow/python/kernel_tests/image_ops/draw_bounding_box_op_test.py +++ b/tensorflow/python/kernel_tests/image_ops/draw_bounding_box_op_test.py @@ -135,7 +135,7 @@ def testDrawBoundingBoxHalf(self): image, dtype=dtypes.half, colors=colors) # generate_bound_box_proposals is only available on GPU. - @test_util.run_gpu_only() + @test_util.run_gpu_only def testGenerateBoundingBoxProposals(self): # Op only exists on GPU. 
with self.cached_session(use_gpu=True): diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD index 82dcd51214819e..936e4204d3ace8 100644 --- a/tensorflow/python/kernel_tests/linalg/BUILD +++ b/tensorflow/python/kernel_tests/linalg/BUILD @@ -256,7 +256,7 @@ cuda_py_strict_test( name = "linear_operator_circulant_test", size = "medium", srcs = ["linear_operator_circulant_test.py"], - shard_count = 32, + shard_count = 50, tags = [ "no_cuda11", # TODO(b/197522782): reenable test after fixing. "optonly", # times out, b/79171797 @@ -412,7 +412,7 @@ cuda_py_strict_test( name = "linear_operator_low_rank_update_test", size = "medium", srcs = ["linear_operator_low_rank_update_test.py"], - shard_count = 10, + shard_count = 15, tags = ["optonly"], deps = [ "//tensorflow/python/framework:config", @@ -516,12 +516,14 @@ cuda_py_strict_test( name = "linear_operator_tridiag_test", size = "medium", srcs = ["linear_operator_tridiag_test.py"], - shard_count = 5, + shard_count = 10, tags = [ "no_windows_gpu", "optonly", ], - xla_enable_strict_auto_jit = True, + # TODO(b/313470344): XLA temporarily disabled due to empty shards on 3.12. + xla_enable_strict_auto_jit = False, + xla_enabled = False, deps = [ "//tensorflow/python/framework:config", "//tensorflow/python/framework:test_lib", @@ -881,7 +883,7 @@ cuda_py_strict_test( name = "tridiagonal_matmul_op_test", size = "medium", srcs = ["tridiagonal_matmul_op_test.py"], - shard_count = 10, + shard_count = 5, deps = [ "//tensorflow/python/client:session", "//tensorflow/python/eager:context", diff --git a/tensorflow/python/kernel_tests/linalg/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg/linalg_grad_test.py index 3f37a3585101d1..f478a85cd63df8 100644 --- a/tensorflow/python/kernel_tests/linalg/linalg_grad_test.py +++ b/tensorflow/python/kernel_tests/linalg/linalg_grad_test.py @@ -240,7 +240,7 @@ def Test(self): lambda x: linalg_ops.matrix_inverse(x, adjoint=True), dtype, shape)) - if not test_lib.is_built_with_rocm(): + if True: # not test_lib.is_built_with_rocm(): # TODO(rocm) : # re-enable this test when upstream issues are resolved # see commit msg for details diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py index ee84171e67ca81..ddd6879ba020a0 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py @@ -286,7 +286,8 @@ def dtypes_to_test(): def optional_tests(): """List of optional test names to run.""" return [ - "operator_matmul_with_same_type", + # TODO: b/310008894 - Re-enable this optional test. + # "operator_matmul_with_same_type", "operator_solve_with_same_type", ] @@ -371,7 +372,8 @@ def setUp(self): def optional_tests(): """List of optional test names to run.""" return [ - "operator_matmul_with_same_type", + # TODO: b/310008894 - Re-enable this optional test. + # "operator_matmul_with_same_type", "operator_solve_with_same_type", ] @@ -445,7 +447,8 @@ def skip_these_tests(): def optional_tests(): """List of optional test names to run.""" return [ - "operator_matmul_with_same_type", + # TODO: b/310008894 - Re-enable this optional test. 
+ # "operator_matmul_with_same_type", "operator_solve_with_same_type", ] @@ -649,7 +652,8 @@ def operator_shapes_infos(): def optional_tests(): """List of optional test names to run.""" return [ - "operator_matmul_with_same_type", + # TODO: b/310008894 - Re-enable this optional test. + # "operator_matmul_with_same_type", "operator_solve_with_same_type", ] diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py index ca5cb6e0d1f7a9..bcb1360ea6b2eb 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py @@ -33,6 +33,13 @@ class LinearOperatorInversionTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" + # TODO: b/311343496 - Re-enable this test. + @staticmethod + def skip_these_tests() -> list[str]: + return [ + "test_saved_model", + ] + def tearDown(self): config.enable_tensor_float_32_execution(self.tf32_keep_) diff --git a/tensorflow/python/kernel_tests/linalg/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/linalg/matrix_square_root_op_test.py index 73d9d9263e9262..28511c609a9d8f 100644 --- a/tensorflow/python/kernel_tests/linalg/matrix_square_root_op_test.py +++ b/tensorflow/python/kernel_tests/linalg/matrix_square_root_op_test.py @@ -25,7 +25,6 @@ from tensorflow.python.platform import test -@test_util.run_all_without_tensor_float_32 class SquareRootOpTest(test.TestCase): def _verifySquareRoot(self, matrix, np_type): @@ -65,16 +64,19 @@ def _testMatrices(self, matrix1, matrix2): self._verifySquareRootComplex(matrix2) self._verifySquareRootComplex(self._makeBatch(matrix1, matrix2)) + @test_util.run_without_tensor_float_32 def testSymmetricPositiveDefinite(self): matrix1 = np.array([[2., 1.], [1., 2.]]) matrix2 = np.array([[3., -1.], [-1., 3.]]) self._testMatrices(matrix1, matrix2) + @test_util.run_without_tensor_float_32 def testAsymmetric(self): matrix1 = np.array([[0., 4.], [-1., 5.]]) matrix2 = np.array([[33., 24.], [48., 57.]]) self._testMatrices(matrix1, matrix2) + @test_util.run_without_tensor_float_32 def testIdentityMatrix(self): # 2x2 identity = np.array([[1., 0], [0, 1.]]) @@ -83,11 +85,13 @@ def testIdentityMatrix(self): identity = np.array([[1., 0, 0], [0, 1., 0], [0, 0, 1.]]) self._verifySquareRootReal(identity) + @test_util.run_without_tensor_float_32 def testEmpty(self): self._verifySquareRootReal(np.empty([0, 2, 2])) self._verifySquareRootReal(np.empty([2, 0, 0])) @test_util.run_in_graph_and_eager_modes(use_gpu=True) + @test_util.run_without_tensor_float_32 def testWrongDimensions(self): # The input to the square root should be at least a 2-dimensional tensor. 
tensor = constant_op.constant([1., 2.]) @@ -95,12 +99,14 @@ def testWrongDimensions(self): gen_linalg_ops.matrix_square_root(tensor) @test_util.run_in_graph_and_eager_modes(use_gpu=True) + @test_util.run_without_tensor_float_32 def testNotSquare(self): with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]]) self.evaluate(gen_linalg_ops.matrix_square_root(tensor)) @test_util.run_in_graph_and_eager_modes(use_gpu=True) + @test_util.run_without_tensor_float_32 def testConcurrentExecutesWithoutError(self): matrix_shape = [5, 5] seed = [42, 24] diff --git a/tensorflow/python/kernel_tests/linalg/sparse/BUILD b/tensorflow/python/kernel_tests/linalg/sparse/BUILD index 9463e04bf8f0bc..d4d8d65195db00 100644 --- a/tensorflow/python/kernel_tests/linalg/sparse/BUILD +++ b/tensorflow/python/kernel_tests/linalg/sparse/BUILD @@ -85,7 +85,7 @@ cuda_py_strict_test( size = "medium", srcs = ["csr_sparse_matrix_grad_test.py"], main = "csr_sparse_matrix_grad_test.py", - shard_count = 50, + shard_count = 3, deps = [ "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", @@ -126,7 +126,7 @@ cuda_py_strict_test( size = "medium", srcs = ["csr_sparse_matrix_sparse_mat_mul_grad_test.py"], main = "csr_sparse_matrix_sparse_mat_mul_grad_test.py", - shard_count = 50, + shard_count = 10, deps = [ "//tensorflow/python/framework:ops", "//tensorflow/python/framework:test_lib", diff --git a/tensorflow/python/kernel_tests/math_ops/BUILD b/tensorflow/python/kernel_tests/math_ops/BUILD index 0903a13db22c46..237e39435036ef 100644 --- a/tensorflow/python/kernel_tests/math_ops/BUILD +++ b/tensorflow/python/kernel_tests/math_ops/BUILD @@ -1,6 +1,10 @@ # Tests of TensorFlow math kernels written using the Python API. 
load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test") +load( + "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", +) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -283,7 +287,7 @@ cuda_py_strict_test( name = "cwise_ops_unary_test", size = "medium", srcs = ["cwise_ops_unary_test.py"], - shard_count = 50, + shard_count = 10, tags = [ "no_windows", # TODO(b/207048097): re-enable ], @@ -369,7 +373,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "reduce_benchmark_test", srcs = ["reduce_benchmark_test.py"], deps = [ diff --git a/tensorflow/python/kernel_tests/math_ops/cwise_ops_binary_test.py b/tensorflow/python/kernel_tests/math_ops/cwise_ops_binary_test.py index 8a1d14be8417ee..b0cf1d0058c1a5 100644 --- a/tensorflow/python/kernel_tests/math_ops/cwise_ops_binary_test.py +++ b/tensorflow/python/kernel_tests/math_ops/cwise_ops_binary_test.py @@ -883,7 +883,10 @@ def testPowNegativeExponentGpu(self): z = math_ops.pow(x, y) self.assertAllEqual(self.evaluate(z), [0, 1, 1, 1, -1]) - def testFloorModInfDenominator(self): + @test.disable_with_predicate( + pred=test.is_built_with_rocm, skip_message="On ROCm this test fails" + ) + def testFloorModfInfDenominator(self): """Regression test for GitHub issue #58369.""" if not test_util.is_gpu_available(): self.skipTest("Requires GPU") diff --git a/tensorflow/python/kernel_tests/math_ops/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/math_ops/cwise_ops_unary_test.py index 29daaea0b1643a..24c06cedce2443 100644 --- a/tensorflow/python/kernel_tests/math_ops/cwise_ops_unary_test.py +++ b/tensorflow/python/kernel_tests/math_ops/cwise_ops_unary_test.py @@ -445,8 +445,6 @@ def f(x): self._compareBoth(x, compute_f32(np.vectorize(math.erfc)), math_ops.erfc) self._compareBoth(x, compute_f32(np.square), math_ops.square) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="On ROCm this test fails") def testInt8Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int8) self._compareCpu(x, np.abs, math_ops.abs) @@ -455,14 +453,10 @@ def testInt8Basic(self): self._compareBoth(x, np.negative, _NEG) self._compareBoth(x, np.sign, math_ops.sign) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="On ROCm this test fails") def testUInt8Basic(self): x = np.arange(6).reshape(1, 3, 2).astype(np.uint8) self._compareBoth(x, np.square, math_ops.square) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="On ROCm this test fails") def testInt16Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int16) self._compareCpu(x, np.abs, math_ops.abs) @@ -471,8 +465,6 @@ def testInt16Basic(self): self._compareBoth(x, np.negative, _NEG) self._compareBoth(x, np.sign, math_ops.sign) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="On ROCm this test fails") def testUInt16Basic(self): x = np.arange(6).reshape(1, 3, 2).astype(np.uint16) self._compareBoth(x, np.square, math_ops.square) @@ -491,8 +483,6 @@ def testInt32Basic(self): self._compareBothSparse(x, np.square, math_ops.square) self._compareBothSparse(x, np.sign, math_ops.sign) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="On ROCm this test fails") def testUInt32Basic(self): x = np.arange(6).reshape(1, 3, 2).astype(np.uint32) self._compareBoth(x, np.square, math_ops.square) @@ -514,8 +504,6 @@ def testInt64Square(self): self._compareCpu(x, 
np.square, math_ops.square) self._compareBothSparse(x, np.square, math_ops.square) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, skip_message="On ROCm this test fails") def testUInt64Basic(self): x = np.arange(6).reshape(1, 3, 2).astype(np.uint64) self._compareBoth(x, np.square, math_ops.square) diff --git a/tensorflow/python/kernel_tests/math_ops/segment_reduction_ops_d9m_test.py b/tensorflow/python/kernel_tests/math_ops/segment_reduction_ops_d9m_test.py index fbd5f9501c0933..3c166b86fabc74 100644 --- a/tensorflow/python/kernel_tests/math_ops/segment_reduction_ops_d9m_test.py +++ b/tensorflow/python/kernel_tests/math_ops/segment_reduction_ops_d9m_test.py @@ -89,9 +89,6 @@ def testUnsortedOps(self): result = op(data, segment_ids, num_segments) self.evaluate(result) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, - skip_message="No ROCm support for complex types in segment reduction ops") @test_util.run_cuda_only def testUnsortedOpsComplex(self): for op in [ diff --git a/tensorflow/python/kernel_tests/nn_ops/conv_ops_test.py b/tensorflow/python/kernel_tests/nn_ops/conv_ops_test.py index f22ddc973cd045..71c2f3a208f122 100644 --- a/tensorflow/python/kernel_tests/nn_ops/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/nn_ops/conv_ops_test.py @@ -207,11 +207,11 @@ def _DtypesToTest(self, use_gpu): if use_gpu: # It is important that float32 comes first, since we are using its # gradients as a reference for fp16 gradients. - out = [dtypes.float32] + out = [dtypes.float32, dtypes.bfloat16] if test_util.GpuSupportsHalfMatMulAndConv(): out.append(dtypes.float16) if not test.is_built_with_rocm(): - out.extend([dtypes.float64, dtypes.bfloat16]) + out.extend([dtypes.float64]) return out return [dtypes.float32, dtypes.float64, dtypes.float16, dtypes.bfloat16] @@ -460,7 +460,7 @@ def _VerifyDilatedConvValuesParameters( op_name, rtol=1e-4, ): - if use_gpu and not test.is_gpu_available(cuda_only=True): + if use_gpu and not test.is_gpu_available(): self.skipTest("GPU not available") expected_results = [] computed_results = [] @@ -520,7 +520,7 @@ def _VerifyValues(self, gpu_only=False, test_grappler_layout_optimizer=False, tol=1e-5): - if gpu_only and not test.is_gpu_available(cuda_only=True): + if gpu_only and not test.is_gpu_available(): return tensors = [] dilations = list(dilations) @@ -577,7 +577,7 @@ def _VerifyValuesParameters( test_grappler_layout_optimizer=False, tol=1e-5, ): - if (gpu_only and not use_gpu) or not test.is_gpu_available(cuda_only=True): + if (gpu_only and not use_gpu) or not test.is_gpu_available(): self.skipTest("GPU not available") if ( test_grappler_layout_optimizer or data_format != "NHWC" @@ -1330,8 +1330,12 @@ def MakeConv2d(inputs, filters): results[0], results[1], atol=tol_to_use, rtol=tol_to_use) @test_util.run_in_graph_and_eager_modes + @test.disable_with_predicate( + pred=test.is_built_with_rocm, + skip_message="MIOpen does not support group conv yet!", + ) def testConv2DGroupConvFwd(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): data_formats = ["NHWC", "NCHW"] else: data_formats = ["NHWC"] @@ -1347,7 +1351,11 @@ def testConv2DGroupConvFwd(self): dtype=dtypes.float32) @test_util.deprecated_graph_mode_only - @test_util.run_cuda_only + @test_util.run_gpu_only + @test.disable_with_predicate( + pred=test.is_built_with_rocm, + skip_message="MIOpen does not support group conv yet!", + ) def testInputGradientGroupConv(self): for data_format in ["NCHW", 
"NHWC"]: for test_input in [True, False]: @@ -1369,7 +1377,11 @@ def testInputGradientGroupConv(self): max_err=0.005) @test_util.deprecated_graph_mode_only - @test_util.run_cuda_only + @test_util.run_gpu_only + @test.disable_with_predicate( + pred=test.is_built_with_rocm, + skip_message="MIOpen does not support group conv yet!", + ) def testFilterGradientGroupConv(self): for data_format in ["NCHW", "NHWC"]: for test_input in [True, False]: @@ -1407,7 +1419,7 @@ def _RunAndVerifyBackpropInput(self, use_gpu, err, dilations=(1, 1)): - if use_gpu and not test.is_gpu_available(cuda_only=True): + if use_gpu and not test.is_gpu_available(): return x1 = self._CreateNumpyTensor(filter_sizes) x2 = self._CreateNumpyTensor(output_sizes) @@ -1893,7 +1905,7 @@ def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes, @test_util.deprecated_graph_mode_only def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropFilterDilation( input_sizes=[1, 3, 6, 1], @@ -1908,7 +1920,7 @@ def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self): @test_util.deprecated_graph_mode_only def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropFilterDilation( input_sizes=[1, 2, 3, 1], @@ -1923,7 +1935,7 @@ def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self): @test_util.deprecated_graph_mode_only def testConv2DEmptyBackpropFilterDilation1x2(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropFilterDilation( input_sizes=[1, 2, 3, 1], @@ -1938,7 +1950,7 @@ def testConv2DEmptyBackpropFilterDilation1x2(self): @test_util.deprecated_graph_mode_only def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropFilterDilation( input_sizes=[1, 3, 4, 3], @@ -1953,7 +1965,7 @@ def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self): @test_util.deprecated_graph_mode_only def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropFilterDilation( input_sizes=[1, 3, 3, 1], @@ -1968,7 +1980,7 @@ def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self): @test_util.deprecated_graph_mode_only def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropInputDilation( input_sizes=[1, 3, 6, 1], @@ -1983,7 +1995,7 @@ def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self): @test_util.deprecated_graph_mode_only def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self): - if 
test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropInputDilation( input_sizes=[1, 2, 3, 1], @@ -1998,7 +2010,7 @@ def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self): @test_util.deprecated_graph_mode_only def testConv2DEmptyBackpropInputDilation1x2(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropInputDilation( input_sizes=[0, 2, 3, 1], @@ -2013,7 +2025,7 @@ def testConv2DEmptyBackpropInputDilation1x2(self): @test_util.deprecated_graph_mode_only def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): # The GPU version of this test is not very stable. So adjusting the # error threshold to 1e-4. @@ -2030,7 +2042,7 @@ def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self): @test_util.deprecated_graph_mode_only def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self): - if test.is_gpu_available(cuda_only=True) or test_util.IsMklEnabled(): + if test.is_gpu_available() or test_util.IsMklEnabled(): for (data_format, use_gpu) in GetTestConfigs(): self._RunAndVerifyBackpropInputDilation( input_sizes=[1, 3, 3, 1], @@ -2053,7 +2065,7 @@ def _RunAndVerifyBackpropInputExplicitPadding(self, use_gpu, dilations=(1, 1), err=2e-5): - if use_gpu and not test.is_gpu_available(cuda_only=True): + if use_gpu and not test.is_gpu_available(): return if not use_gpu and dilations != (1, 1): return # Non-default dilations is currently not supported on the CPU. @@ -2215,7 +2227,7 @@ def _RunAndVerifyBackpropFilterExplicitPadding(self, use_gpu, dilations=(1, 1), err=1e-5): - if use_gpu and not test.is_gpu_available(cuda_only=True): + if use_gpu and not test.is_gpu_available(): return if not use_gpu and dilations != (1, 1): return # Non-default dilations is currently not supported on the CPU. @@ -3513,7 +3525,6 @@ def testConv2D3x3FilterStride1x1Valid(self): def testConv2D3x3FilterStride1x1Same(self): self._RunTestCases([1, 1], "SAME") - class Conv2DBenchmark(test.Benchmark): def benchmarkGPUConvStackFirst(self): diff --git a/tensorflow/python/kernel_tests/nn_ops/depthwise_conv_op_base.py b/tensorflow/python/kernel_tests/nn_ops/depthwise_conv_op_base.py index a9f63ad6ce9a94..4e466a0a1c876a 100644 --- a/tensorflow/python/kernel_tests/nn_ops/depthwise_conv_op_base.py +++ b/tensorflow/python/kernel_tests/nn_ops/depthwise_conv_op_base.py @@ -407,7 +407,7 @@ def _VerifyValues(self, interface_result, np_result, atol=tolerance, rtol=tolerance) @test_util.run_v1_only("b/120545219") - @test_util.run_cuda_only + @test_util.run_gpu_only def testDepthwiseConv2DCudnn(self): for index, (input_size, filter_size, _, stride, padding, dilations) in enumerate(ConfigsToTest()): @@ -510,10 +510,10 @@ def testDepthwiseConv2DExplicit(self): "Testing DepthwiseConv2D, %dth config: %r * %r, stride: %d, padding: " "%s", index, input_size, filter_size, stride, padding) # double datatype is currently not supported for convolution ops - # on the ROCm platform and its support for bfloat16 is unknown. 
- data_types = [dtypes.float16, dtypes.float32] + # on the ROCm platform + data_types = [dtypes.float16, dtypes.float32, dtypes.bfloat16] if not test.is_built_with_rocm(): - data_types.extend([dtypes.float64, dtypes.bfloat16]) + data_types.extend([dtypes.float64]) data_formats = ["NHWC", "NCHW"] if test.is_gpu_available() else ["NHWC"] for data_type in data_types: for data_format in data_formats: @@ -529,8 +529,7 @@ def testDepthwiseConv2DExplicit(self): dilations=dilations, tolerance=tolerance) - -# This is testing against hand calculated results. + # This is testing against hand calculated results. def _VerifyHandValues(self, tensor_in_sizes, filter_in_sizes, stride, padding, expected, use_gpu): @@ -736,7 +735,7 @@ def _ConstructAndTestGradient(self, self.assertLess(err, tolerance) @test_util.run_v1_only("b/120545219") - @test_util.run_cuda_only + @test_util.run_gpu_only def testDepthwiseConv2DInputGradCudnn(self): for index, (input_size, filter_size, output_size, stride, padding, dilations) in enumerate(CheckGradConfigsToTest()): @@ -832,10 +831,10 @@ def testDepthwiseConv2DInputGradExplicit(self): "stride: %d, padding: %s", index, input_size, filter_size, stride, padding) # double datatype is currently not supported for convolution ops - # on the ROCm platform and its support for bfloat16 is unknown. - data_types = [dtypes.float16, dtypes.float32] + # on the ROCm platform + data_types = [dtypes.float16, dtypes.float32, dtypes.bfloat16] if not test.is_built_with_rocm(): - data_types.extend([dtypes.float64, dtypes.bfloat16]) + data_types.extend([dtypes.float64]) data_formats = ["NHWC", "NCHW"] if test.is_gpu_available() else ["NHWC"] for data_type in data_types: for data_format in data_formats: @@ -852,7 +851,7 @@ def testDepthwiseConv2DInputGradExplicit(self): dilations=dilations) @test_util.run_v1_only("b/120545219") - @test_util.run_cuda_only + @test_util.run_gpu_only def testDepthwiseConv2DFilterGradCudnn(self): for index, (input_size, filter_size, output_size, stride, padding, dilations) in enumerate(CheckGradConfigsToTest()): @@ -945,10 +944,10 @@ def testDepthwiseConv2DFilterGradExplicit(self): "stride: %d, padding: %s", index, input_size, filter_size, stride, padding) # double datatype is currently not supported for convolution ops - # on the ROCm platform and its support for bfloat16 is unknown. - data_types = [dtypes.float16, dtypes.float32] + # on the ROCm platform + data_types = [dtypes.float16, dtypes.float32, dtypes.bfloat16] if not test.is_built_with_rocm(): - data_types.extend([dtypes.float64, dtypes.bfloat16]) + data_types.extend([dtypes.float64]) data_formats = ["NHWC", "NCHW"] if test.is_gpu_available() else ["NHWC"] for data_type in data_types: for data_format in data_formats: @@ -999,14 +998,14 @@ def testDepthwiseConv2DInputGradCompare(self): padding) self._CompareBackpropInput(input_size, filter_size, output_size, stride, padding, "float32") - # Convolutions on the ROCm platform don't support double dtype. And its - # support for bf16 is unknown. So, we skip these tests. - if test.is_built_with_rocm(): - continue - self._CompareBackpropInput(input_size, filter_size, output_size, stride, - padding, "float64") self._CompareBackpropInput(input_size, filter_size, output_size, stride, padding, "bfloat16") + # Convolutions on the ROCm platform don't support double dtype. + # So, we skip these tests. 
+ if not test.is_built_with_rocm(): + self._CompareBackpropInput( + input_size, filter_size, output_size, stride, padding, "float64" + ) @test_util.run_gpu_only def testDepthwiseConv2DInputGradExplicitCompare(self): @@ -1020,14 +1019,13 @@ def testDepthwiseConv2DInputGradExplicitCompare(self): padding) self._CompareBackpropInput(input_size, filter_size, output_size, stride, padding, "float32") - # Convolutions on the ROCm platform don't support double dtype. And its - # support for bf16 is unknown. So, we skip these tests. - if test.is_built_with_rocm(): - continue - self._CompareBackpropInput(input_size, filter_size, output_size, stride, - padding, "float64") self._CompareBackpropInput(input_size, filter_size, output_size, stride, padding, "bfloat16") + # Convolutions on the ROCm platform don't support double dtype. + if not test.is_built_with_rocm(): + self._CompareBackpropInput( + input_size, filter_size, output_size, stride, padding, "float64" + ) def _CompareBackpropFilter(self, input_sizes, filter_sizes, output_sizes, stride, padding, dtype): @@ -1080,15 +1078,13 @@ def testDepthwiseConv2DFilterGradCompare(self): padding) self._CompareBackpropFilter(input_size, filter_size, output_size, stride, padding, "float32") - # Convolutions on the ROCm platform don't support double dtype. And its - # support for bf16 is unknown. So, we skip these tests. - if test.is_built_with_rocm(): - continue - self._CompareBackpropFilter(input_size, filter_size, output_size, stride, - padding, "float64") - self._CompareBackpropFilter(input_size, filter_size, output_size, stride, padding, "bfloat16") + # Convolutions on the ROCm platform don't support double dtype. + if not test.is_built_with_rocm(): + self._CompareBackpropFilter( + input_size, filter_size, output_size, stride, padding, "float64" + ) @test_util.run_gpu_only def testDepthwiseConv2DFilterGradExplicitCompare(self): @@ -1102,15 +1098,13 @@ def testDepthwiseConv2DFilterGradExplicitCompare(self): padding) self._CompareBackpropFilter(input_size, filter_size, output_size, stride, padding, "float32") - # Convolutions on the ROCm platform don't support double dtype. And its - # support for bf16 is unknown. So, we skip these tests. - if test.is_built_with_rocm(): - continue - self._CompareBackpropFilter(input_size, filter_size, output_size, stride, - padding, "float64") - self._CompareBackpropFilter(input_size, filter_size, output_size, stride, padding, "bfloat16") + # Convolutions on the ROCm platform don't support double dtype. + if not test.is_built_with_rocm(): + self._CompareBackpropFilter( + input_size, filter_size, output_size, stride, padding, "float64" + ) def _CompareForward(self, input_sizes, filter_sizes, output_sizes, stride, padding, dtype): @@ -1146,16 +1140,15 @@ def testDepthwiseConv2DForwardCompare(self): padding) self._CompareForward(input_size, filter_size, output_size, stride, padding, "float32") - # Convolutions on the ROCm platform don't support double dtype. And its - # support for bf16 is unknown. So, we skip these tests. - if test.is_built_with_rocm(): - continue - self._CompareForward(input_size, filter_size, output_size, stride, - padding, "float64") - self._CompareForward(input_size, filter_size, output_size, stride, padding, "bfloat16") + # Convolutions on the ROCm platform don't support double dtype. 
+ if not test.is_built_with_rocm(): + self._CompareForward( + input_size, filter_size, output_size, stride, padding, "float64" + ) + @test_util.run_gpu_only def testDepthwiseConv2DForwardExplicitCompare(self): for index, (input_size, filter_size, output_size, stride, padding, @@ -1166,14 +1159,14 @@ def testDepthwiseConv2DForwardExplicitCompare(self): "Testing DepthwiseConv2DForwardCompare, %dth config: %r * %r, " "stride: %d, padding: %s", index, input_size, filter_size, stride, padding) - # Convolutions on the ROCm platform don't support double dtype. And its - # support for bf16 is unknown. So, we skip these tests. - if test.is_built_with_rocm(): - continue - self._CompareForward(input_size, filter_size, output_size, stride, - padding, "float64") + self._CompareForward(input_size, filter_size, output_size, stride, padding, "float32") - self._CompareForward(input_size, filter_size, output_size, stride, padding, "bfloat16") + + # Convolutions on the ROCm platform don't support double dtype. + if not test.is_built_with_rocm(): + self._CompareForward( + input_size, filter_size, output_size, stride, padding, "float64" + ) diff --git a/tensorflow/python/kernel_tests/nn_ops/losses_test.py b/tensorflow/python/kernel_tests/nn_ops/losses_test.py index 7da91f686a849c..b339738b485800 100644 --- a/tensorflow/python/kernel_tests/nn_ops/losses_test.py +++ b/tensorflow/python/kernel_tests/nn_ops/losses_test.py @@ -101,7 +101,7 @@ def testLossWithSampleSpecificWeightsAllZero(self): with self.cached_session(): self.assertAlmostEqual(0.0, self.evaluate(loss), 3) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testEagerNoMemoryLeaked(self): # This is a somewhat convoluted way of testing that nothing gets added to # a global collection. 
@@ -244,7 +244,7 @@ def testAllCorrectInt32Labels(self): self.assertEqual(loss.op.name, 'sparse_softmax_cross_entropy_loss/value') self.assertAlmostEqual(self.evaluate(loss), 0.0, 3) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testEagerNoMemoryLeaked(self): logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]]) diff --git a/tensorflow/python/kernel_tests/nn_ops/rnn_test.py b/tensorflow/python/kernel_tests/nn_ops/rnn_test.py index e517f4ecc8864c..f13a2521d44516 100644 --- a/tensorflow/python/kernel_tests/nn_ops/rnn_test.py +++ b/tensorflow/python/kernel_tests/nn_ops/rnn_test.py @@ -240,7 +240,7 @@ def testUnbalancedOutputIsAccepted(self): self.assertAllEqual([[[1, 1], [2, 2], [3, 3], [4, 4]]], outputs[1]) self.assertAllEqual(4, state) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testEagerMemory(self): with context.eager_mode(): cell = TensorArrayStateRNNCell() diff --git a/tensorflow/python/kernel_tests/sparse_ops/sparse_xent_op_test_base.py b/tensorflow/python/kernel_tests/sparse_ops/sparse_xent_op_test_base.py index a30d82591da5c9..381e5c093f007e 100644 --- a/tensorflow/python/kernel_tests/sparse_ops/sparse_xent_op_test_base.py +++ b/tensorflow/python/kernel_tests/sparse_ops/sparse_xent_op_test_base.py @@ -71,7 +71,7 @@ def testSingleClass(self): self.assertAllClose([0.0, 0.0, 0.0], tf_loss) self.assertAllClose([[0.0], [0.0], [0.0]], tf_gradient) - @test_util.run_gpu_only() + @test_util.run_gpu_only def _testInvalidLabelGPU(self, invalid_label_gradient=np.nan): labels = [4, 3, 0, -1] logits = [[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 2., 3., 4.], diff --git a/tensorflow/python/kernel_tests/summary_ops/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops/summary_ops_test.py index ae41fc42fb0260..cdccc3da78c9ea 100644 --- a/tensorflow/python/kernel_tests/summary_ops/summary_ops_test.py +++ b/tensorflow/python/kernel_tests/summary_ops/summary_ops_test.py @@ -996,7 +996,7 @@ def testNoMemoryLeak_graphMode(self): with context.graph_mode(), ops.Graph().as_default(): summary_ops.create_file_writer_v2(logdir) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNoMemoryLeak_eagerMode(self): logdir = self.get_temp_dir() with summary_ops.create_file_writer_v2(logdir).as_default(): @@ -1495,12 +1495,13 @@ def f(): assert context.executing_eagerly() logdir = self.get_temp_dir() writer = summary_ops.create_file_writer_v2(logdir) - summary_ops.trace_on(graph=True, profiler=True) profiler_outdir = self.get_temp_dir() + summary_ops.trace_on( + graph=True, profiler=True, profiler_outdir=profiler_outdir + ) with writer.as_default(): f() - summary_ops.trace_export( - name='foo', step=1, profiler_outdir=profiler_outdir) + summary_ops.trace_export(name='foo', step=1) writer.close() @test_util.run_v2_only diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py index 64972f3f850768..bcfa84b14d1507 100644 --- a/tensorflow/python/module/module_test.py +++ b/tensorflow/python/module/module_test.py @@ -17,6 +17,8 @@ import abc import collections import itertools +import sys +import unittest from absl.testing import parameterized @@ -514,6 +516,8 @@ class DangerousModule(module.Module): self.assertLen(mod.variables, 1) self.assertEqual(mod.variables[0], mod.normal_variable) + @unittest.skipIf(sys.version_info.major == 3 and 
sys.version_info.minor == 12, + reason="b/313658911: _TupleWrapper __dict__ attribute error") def test_with_path(self): mod = module.Module() mod.w = variables.Variable(1.) @@ -531,6 +535,8 @@ def test_with_path(self): ("decoder", "w", 0, 0, "k"): mod.decoder.w[0][0]["k"], ("decoder", "w", 0, 1, "k"): mod.decoder.w[0][1]["k"]},) + @unittest.skipIf(sys.version_info.major == 3 and sys.version_info.minor == 12, + reason="b/313658911: _TupleWrapper __dict__ attribute error") def test_cycles_with_path(self): mod = module.Module() mod.w = variables.Variable(1.) diff --git a/tensorflow/python/modules_with_exports.py b/tensorflow/python/modules_with_exports.py index 5f86568227670c..793823905688ce 100644 --- a/tensorflow/python/modules_with_exports.py +++ b/tensorflow/python/modules_with_exports.py @@ -31,6 +31,10 @@ from tensorflow.core.protobuf.config_pb2 import * from tensorflow.core.util.event_pb2 import * +# Checkpoint Sharding +from tensorflow.python.checkpoint.sharding import sharding_util +from tensorflow.python.checkpoint.sharding import sharding_policies + # Compat from tensorflow.python.compat import v2_compat @@ -117,6 +121,7 @@ from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import sets from tensorflow.python.ops import stateful_random_ops +from tensorflow.python.ops import tensor_getitem_override from tensorflow.python.ops import while_v2 from tensorflow.python.ops.linalg import linalg from tensorflow.python.ops.linalg.sparse import sparse @@ -170,6 +175,7 @@ # Summary from tensorflow.python.summary import summary +from tensorflow.python.summary import tb_summary # TPU from tensorflow.python.tpu import api diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD index af1057af5a975c..a615f94cc04835 100644 --- a/tensorflow/python/ops/BUILD +++ b/tensorflow/python/ops/BUILD @@ -2,6 +2,10 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test") load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test") load("//tensorflow/core/platform:build_config_root.bzl", "tf_additional_xla_deps_py") load("//tensorflow/python:build_defs.bzl", "tf_gen_op_strict_wrapper_private_py") +load( + "//tensorflow/tools/test:performance.bzl", + "cuda_py_benchmark_test", +) visibility = [ "//engedu/ml/tf_from_scratch:__pkg__", @@ -307,6 +311,7 @@ tf_gen_op_strict_wrapper_private_py( visibility = [ "//learning/brain/python/ops:__pkg__", "//tensorflow/compiler/tests:__pkg__", + "//tensorflow/dtensor/python/tests:__pkg__", "//tensorflow/python:__pkg__", "//tensorflow/python/kernel_tests/image_ops:__pkg__", "//tensorflow/python/ops/parallel_for:__pkg__", @@ -482,6 +487,7 @@ tf_gen_op_strict_wrapper_private_py( name = "parsing_ops_gen", visibility = [ "//learning/brain/python/ops:__pkg__", + "//tensorflow/dtensor/python/tests:__pkg__", "//tensorflow/python:__pkg__", "//tensorflow/python/autograph/operators:__pkg__", "//tensorflow/python/data/ops:__pkg__", @@ -514,6 +520,7 @@ tf_gen_op_strict_wrapper_private_py( visibility = [ "//learning/brain/python/ops:__pkg__", "//tensorflow/compiler/tests:__pkg__", + "//tensorflow/dtensor/python/tests:__pkg__", "//tensorflow/python:__pkg__", "//tensorflow/python/kernel_tests/random:__pkg__", ], @@ -775,7 +782,10 @@ cuda_py_strict_test( py_strict_library( name = "array_ops", - srcs = ["array_ops.py"], + srcs = [ + "array_ops.py", + "tensor_getitem_override.py", + ], srcs_version = "PY3", visibility = visibility, deps = [ @@ -1481,7 +1491,6 @@ py_strict_library( ":ctc_ops_gen", 
":custom_gradient", ":functional_ops", - # TODO(b/280454072) Remove inplace_ops and compat when forward compatibility window expires. ":inplace_ops", ":linalg_ops", ":map_fn", @@ -1489,7 +1498,6 @@ py_strict_library( ":nn_grad", ":nn_ops", ":sparse_ops", - "//tensorflow/python/compat", "//tensorflow/python/eager:context", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:constant_op", @@ -1629,6 +1637,7 @@ py_strict_library( ":cudnn_rnn_grad", ":gradients_util", ":image_grad", + ":io_ops", ":linalg_grad", ":linalg_ops", ":logging_ops", @@ -2052,7 +2061,10 @@ py_strict_library( py_strict_library( name = "math_ops", - srcs = ["math_ops.py"], + srcs = [ + "math_ops.py", + "tensor_math_operator_overrides.py", + ], srcs_version = "PY3", deps = [ ":array_ops", @@ -2068,20 +2080,20 @@ py_strict_library( "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:indexed_slices", "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:override_binary_operator", "//tensorflow/python/framework:sparse_tensor", "//tensorflow/python/framework:tensor", "//tensorflow/python/framework:tensor_conversion_registry", "//tensorflow/python/framework:tensor_shape", "//tensorflow/python/framework:tensor_util", - "//tensorflow/python/ops/numpy_ops:np_dtypes", "//tensorflow/python/platform:tf_logging", + "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:dispatch", "//tensorflow/python/util:nest", "//tensorflow/python/util:tf_decorator_py", "//tensorflow/python/util:tf_export", - "//tensorflow/python/util:traceback_utils", "//third_party/py/numpy", ], ) @@ -2101,49 +2113,9 @@ py_strict_library( ], ) -py_strict_library( +alias( name = "resource_variable_ops", - srcs = ["resource_variable_ops.py"], - srcs_version = "PY3", - deps = [ - ":array_ops", - ":array_ops_gen", - ":handle_data_util", - ":math_ops", - ":resource_variable_ops_gen", - ":state_ops", - ":state_ops_gen", - ":variables", - "//tensorflow/core:protos_all_py", - "//tensorflow/core/function/trace_type", - "//tensorflow/python/checkpoint:tensor_callable", - "//tensorflow/python/client:pywrap_tf_session", - "//tensorflow/python/compat", - "//tensorflow/python/eager:context", - "//tensorflow/python/eager:record", - "//tensorflow/python/eager:tape", - "//tensorflow/python/framework:auto_control_deps_utils", - "//tensorflow/python/framework:composite_tensor", - "//tensorflow/python/framework:composite_tensor_gradient", - "//tensorflow/python/framework:constant_op", - "//tensorflow/python/framework:cpp_shape_inference_proto_py", - "//tensorflow/python/framework:device", - "//tensorflow/python/framework:dtypes", - "//tensorflow/python/framework:errors", - "//tensorflow/python/framework:indexed_slices", - "//tensorflow/python/framework:ops", - "//tensorflow/python/framework:tensor", - "//tensorflow/python/framework:tensor_conversion_registry", - "//tensorflow/python/framework:tensor_shape", - "//tensorflow/python/saved_model:nested_structure_coder", - "//tensorflow/python/trackable:base", - "//tensorflow/python/types:core", - "//tensorflow/python/util:_pywrap_utils", - "//tensorflow/python/util:compat", - "//tensorflow/python/util:deprecation", - "//tensorflow/python/util:tf_export", - "//third_party/py/numpy", - ], + actual = ":variables", ) py_strict_library( @@ -2998,7 +2970,6 @@ py_strict_library( "//tensorflow/dtensor/python:api", "//tensorflow/dtensor/python:layout", "//tensorflow/python/eager:context", - 
"//tensorflow/python/eager:profiler", "//tensorflow/python/framework:constant_op", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", @@ -3006,6 +2977,7 @@ py_strict_library( "//tensorflow/python/framework:tensor", "//tensorflow/python/framework:tensor_util", "//tensorflow/python/platform:tf_logging", + "//tensorflow/python/profiler:profiler_v2", "//tensorflow/python/trackable:resource", "//tensorflow/python/training:training_util", "//tensorflow/python/util:deprecation", @@ -3134,46 +3106,78 @@ py_strict_library( py_strict_library( name = "variables", - srcs = ["variables.py"], + srcs = [ + "resource_variable_ops.py", + "variables.py", + ], srcs_version = "PY3", deps = [ ":array_ops", + ":array_ops_gen", ":array_ops_stack", ":control_flow_ops", + ":handle_data_util", ":math_ops", ":math_ops_gen", + ":resource_variable_ops_gen", ":state_ops", + ":state_ops_gen", "//tensorflow/core:protos_all_py", + "//tensorflow/core/function/trace_type", "//tensorflow/python:pywrap_tensorflow", + "//tensorflow/python/checkpoint:tensor_callable", + "//tensorflow/python/client:pywrap_tf_session", + "//tensorflow/python/compat", "//tensorflow/python/eager:context", + "//tensorflow/python/eager:record", + "//tensorflow/python/eager:tape", + "//tensorflow/python/framework:auto_control_deps_utils", + "//tensorflow/python/framework:composite_tensor", + "//tensorflow/python/framework:composite_tensor_gradient", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:cpp_shape_inference_proto_py", + "//tensorflow/python/framework:device", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:indexed_slices", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:tensor", "//tensorflow/python/framework:tensor_conversion_registry", "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/saved_model:nested_structure_coder", "//tensorflow/python/trackable:base", - "//tensorflow/python/util:_pywrap_utils", + "//tensorflow/python/types:core", + "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", "//tensorflow/python/util:object_identity", "//tensorflow/python/util:tf_export", "//tensorflow/python/util:tf_should_use", "//tensorflow/python/util:traceback_utils", + "//third_party/py/numpy", ], ) -py_strict_library( +alias( name = "ref_variable", - srcs = ["ref_variable.py"], + actual = ":variable_v1", +) + +py_strict_library( + name = "variable_v1", + srcs = [ + "ref_variable.py", + "variable_v1.py", + ], srcs_version = "PY3", deps = [ ":array_ops", ":array_ops_gen", + ":cond", ":resource_variable_ops", ":resource_variables_toggle", ":state_ops", ":state_ops_gen", ":variable_scope", - ":variable_v1", ":variables", "//tensorflow/core:protos_all_py", "//tensorflow/python/eager:context", @@ -3187,19 +3191,6 @@ py_strict_library( "//tensorflow/python/types:core", "//tensorflow/python/util:compat", "//tensorflow/python/util:deprecation", - ], -) - -py_strict_library( - name = "variable_v1", - srcs = ["variable_v1.py"], - srcs_version = "PY3", - deps = [ - ":cond", - ":state_ops", - ":variable_scope", - ":variables", - "//tensorflow/python/framework:ops", "//tensorflow/python/util:tf_export", "//tensorflow/python/util:tf_should_use", ], @@ -3735,7 +3726,7 @@ cuda_py_strict_test( main = "nn_fused_batchnorm_test.py", python_version = "PY3", shard_count = 24, - tags = ["no_rocm"], + tags = [], deps = [ ":array_ops", ":gradient_checker", @@ -4033,7 +4024,7 @@ py_strict_test( 
], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "accumulate_n_benchmark", size = "medium", srcs = ["accumulate_n_benchmark.py"], @@ -4055,7 +4046,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "batch_norm_benchmark", srcs = ["batch_norm_benchmark.py"], main = "batch_norm_benchmark.py", @@ -4077,7 +4068,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "collective_ops_benchmark", srcs = ["collective_ops_benchmark.py"], main = "collective_ops_benchmark.py", @@ -4092,7 +4083,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "concat_benchmark", srcs = ["concat_benchmark.py"], main = "concat_benchmark.py", @@ -4109,7 +4100,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "control_flow_ops_benchmark", srcs = ["control_flow_ops_benchmark.py"], main = "control_flow_ops_benchmark.py", @@ -4129,7 +4120,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "conv2d_benchmark", size = "large", srcs = ["conv2d_benchmark.py"], @@ -4150,7 +4141,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "split_benchmark", srcs = ["split_benchmark.py"], main = "split_benchmark.py", @@ -4169,7 +4160,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "transpose_benchmark", size = "medium", srcs = ["transpose_benchmark.py"], @@ -4187,7 +4178,7 @@ cuda_py_strict_test( ], ) -cuda_py_strict_test( +cuda_py_benchmark_test( name = "matmul_benchmark", size = "medium", srcs = ["matmul_benchmark.py"], diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index ab2eedaaadb589..437e504114ffc6 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import common_shapes from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op +from tensorflow.python.framework import constant_tensor_conversion # pylint: disable=unused-import from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import indexed_slices @@ -40,6 +41,7 @@ from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import shape_util +from tensorflow.python.ops import tensor_getitem_override # pylint: disable=unused-import # go/tf-wildcard-import # pylint: disable=wildcard-import from tensorflow.python.ops.gen_array_ops import * @@ -57,10 +59,6 @@ newaxis = None tf_export("newaxis").export_constant(__name__, "newaxis") -# We override the 'slice' for the "slice" op, so we keep Python's -# existing 'slice' for later use in this module. -_BaseSlice = slice - @tf_export("reshape", v1=["reshape", "manip.reshape"]) @dispatch.add_dispatch_support @@ -936,237 +934,6 @@ def rank_internal(input, name=None, optimize=True): return gen_array_ops.rank(input, name=name) -_SLICE_TYPE_ERROR = ( - "Only integers, slices (`:`), ellipsis (`...`), " - "tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid " - "indices") - -_SUPPORTED_SLICE_DTYPES = (dtypes.int16, dtypes.int32, dtypes.int32_ref, - dtypes.int64, dtypes.int64_ref) - - -def _check_index(idx): - """Check if a given value is a valid index into a tensor.""" - if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)): - return - - # Optimistic check. 
Assumptions: - # * any object with a dtype is supported - # * any object with a dtype has a sizeable shape attribute. - dtype = getattr(idx, "dtype", None) - if (dtype is None or dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or - idx.shape and len(idx.shape) == 1): - # TODO(slebedev): IndexError seems more appropriate here, but it - # will break `_slice_helper` contract. - raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx)) - - -def _is_undefined_dimension(d): - return isinstance(d, tensor_shape.Dimension) and d.value is None - - -@tf_export("__operators__.getitem", v1=[]) -@dispatch.add_dispatch_support -def _slice_helper(tensor, slice_spec, var=None): - """Overload for Tensor.__getitem__. - - This operation extracts the specified region from the tensor. - The notation is similar to NumPy with the restriction that - currently only support basic indexing. That means that - using a non-scalar tensor as input is not currently allowed. - - Some useful examples: - - ```python - # Strip leading and trailing 2 elements - foo = tf.constant([1,2,3,4,5,6]) - print(foo[2:-2]) # => [3,4] - - # Skip every other row and reverse the order of the columns - foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) - print(foo[::2,::-1]) # => [[3,2,1], [9,8,7]] - - # Use scalar tensors as indices on both dimensions - print(foo[tf.constant(0), tf.constant(2)]) # => 3 - - # Insert another dimension - foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) - print(foo[tf.newaxis, :, :]) # => [[[1,2,3], [4,5,6], [7,8,9]]] - print(foo[:, tf.newaxis, :]) # => [[[1,2,3]], [[4,5,6]], [[7,8,9]]] - print(foo[:, :, tf.newaxis]) # => [[[1],[2],[3]], [[4],[5],[6]], - [[7],[8],[9]]] - - # Ellipses (3 equivalent operations) - foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) - print(foo[tf.newaxis, :, :]) # => [[[1,2,3], [4,5,6], [7,8,9]]] - print(foo[tf.newaxis, ...]) # => [[[1,2,3], [4,5,6], [7,8,9]]] - print(foo[tf.newaxis]) # => [[[1,2,3], [4,5,6], [7,8,9]]] - - # Masks - foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) - print(foo[foo > 2]) # => [3, 4, 5, 6, 7, 8, 9] - ``` - - Notes: - - `tf.newaxis` is `None` as in NumPy. - - An implicit ellipsis is placed at the end of the `slice_spec` - - NumPy advanced indexing is currently not supported. - - Purpose in the API: - - This method is exposed in TensorFlow's API so that library developers - can register dispatching for `Tensor.__getitem__` to allow it to handle - custom composite tensors & other custom objects. - - The API symbol is not intended to be called by users directly and does - appear in TensorFlow's generated documentation. - - Args: - tensor: An tensor.Tensor object. - slice_spec: The arguments to Tensor.__getitem__. - var: In the case of variable slice assignment, the Variable object to slice - (i.e. tensor is the read-only view of this variable). - - Returns: - The appropriate slice of "tensor", based on "slice_spec". - - Raises: - ValueError: If a slice range is negative size. - TypeError: If the slice indices aren't int, slice, ellipsis, - tf.newaxis or scalar int32/int64 tensors. 
- """ - tensor = ops.convert_to_tensor(tensor) - # TODO(wangpeng): Consider supporting var - if var is None and ops._numpy_style_slicing: # pylint: disable=protected-access - return tensor._numpy_style_getitem(slice_spec) # pylint: disable=protected-access - - if (isinstance(slice_spec, bool) - or (isinstance(slice_spec, tensor_lib.Tensor) - and slice_spec.dtype == dtypes.bool) - or (isinstance(slice_spec, np.ndarray) - and slice_spec.dtype == bool)): - return boolean_mask(tensor=tensor, mask=slice_spec) - - if not isinstance(slice_spec, (list, tuple)): - slice_spec = [slice_spec] - - begin, end, strides = [], [], [] - index = 0 - - new_axis_mask, shrink_axis_mask = 0, 0 - begin_mask, end_mask = 0, 0 - ellipsis_mask = 0 - for s in slice_spec: - if isinstance(s, _BaseSlice): - # Finds the best dtype for begin, end, and strides. - dtype = None - for t in [s.start, s.stop, s.step]: - if t is None or not isinstance(t, tensor_lib.Tensor): - continue - if t.dtype == dtypes.int64: - dtype = dtypes.int64 - elif t.dtype == dtypes.int32 and dtype != dtypes.int64: - dtype = dtypes.int32 - elif t.dtype == dtypes.int16 and dtype is None: - dtype = dtypes.int16 - - if s.start is not None and not _is_undefined_dimension(s.start): - _check_index(s.start) - begin.append(s.start) - else: - if dtype is not None: - begin.append(constant_op.constant(0, dtype=dtype)) - else: - begin.append(0) - begin_mask |= (1 << index) - if s.stop is not None and not _is_undefined_dimension(s.stop): - _check_index(s.stop) - end.append(s.stop) - else: - if dtype is not None: - end.append(constant_op.constant(0, dtype=dtype)) - else: - end.append(0) - end_mask |= (1 << index) - if s.step is not None and not _is_undefined_dimension(s.step): - _check_index(s.step) - strides.append(s.step) - else: - if dtype is not None: - strides.append(constant_op.constant(1, dtype=dtype)) - else: - strides.append(1) - elif s is Ellipsis: - begin.append(0) - end.append(0) - strides.append(1) - ellipsis_mask |= (1 << index) - elif s is newaxis: - begin.append(0) - end.append(0) - strides.append(1) - new_axis_mask |= (1 << index) - else: - _check_index(s) - begin.append(s) - end.append(s + 1) - # TODO(mdan): Investigate why we can't set int32 here. - if ( - isinstance(s, tensor_lib.Tensor) - and (s.dtype == dtypes.int16 or s.dtype == dtypes.int64)): - strides.append(constant_op.constant(1, dtype=s.dtype)) - else: - strides.append(1) - shrink_axis_mask |= (1 << index) - index += 1 - - # stack possibly involves no tensors, so we must use op_scope correct graph. - with ops.name_scope( - None, - "strided_slice", [tensor] + begin + end + strides, - skip_on_eager=False) as name: - if begin: - packed_begin, packed_end, packed_strides = ( - array_ops_stack.stack(begin), - array_ops_stack.stack(end), - array_ops_stack.stack(strides)) - # TODO(mdan): Instead of implicitly casting, it's better to enforce the - # same dtypes. 
- if (packed_begin.dtype == dtypes.int64 or - packed_end.dtype == dtypes.int64 or - packed_strides.dtype == dtypes.int64): - if packed_begin.dtype != dtypes.int64: - packed_begin = gen_math_ops.cast(packed_begin, dtypes.int64) - if packed_end.dtype != dtypes.int64: - packed_end = gen_math_ops.cast(packed_end, dtypes.int64) - if packed_strides.dtype != dtypes.int64: - packed_strides = gen_math_ops.cast(packed_strides, dtypes.int64) - elif (packed_begin.dtype == dtypes.int16 and - packed_end.dtype == dtypes.int16 and - packed_strides.dtype == dtypes.int16): - if packed_begin.dtype != dtypes.int16: - packed_begin = gen_math_ops.cast(packed_begin, dtypes.int16) - if packed_end.dtype != dtypes.int16: - packed_end = gen_math_ops.cast(packed_end, dtypes.int16) - if packed_strides.dtype != dtypes.int16: - packed_strides = gen_math_ops.cast(packed_strides, dtypes.int16) - else: - var_empty = constant([], dtype=dtypes.int32) - packed_begin = packed_end = packed_strides = var_empty - return strided_slice( - tensor, - packed_begin, - packed_end, - packed_strides, - begin_mask=begin_mask, - end_mask=end_mask, - shrink_axis_mask=shrink_axis_mask, - new_axis_mask=new_axis_mask, - ellipsis_mask=ellipsis_mask, - var=var, - name=name) - - # pylint: disable=undefined-variable,protected-access,redefined-outer-name @tf_export("slice") @dispatch.add_dispatch_support @@ -1364,53 +1131,6 @@ def assign(val, name=None): return op -def _SliceHelperVar(var, slice_spec): - """Creates a slice helper object given a variable. - - This allows creating a sub-tensor from part of the current contents - of a variable. See `tf.Tensor.__getitem__` for detailed examples - of slicing. - - This function in addition also allows assignment to a sliced range. - This is similar to `__setitem__` functionality in Python. However, - the syntax is different so that the user can capture the assignment - operation for grouping or passing to `sess.run()` in TF1. - For example, - - ```python - import tensorflow as tf - A = tf.Variable([[1,2,3], [4,5,6], [7,8,9]], dtype=tf.float32) - print(A[:2, :2]) # => [[1,2], [4,5]] - - A[:2,:2].assign(22. * tf.ones((2, 2)))) - print(A) # => [[22, 22, 3], [22, 22, 6], [7,8,9]] - ``` - - Note that assignments currently do not support NumPy broadcasting - semantics. - - Args: - var: An `ops.Variable` object. - slice_spec: The arguments to `Tensor.__getitem__`. - - Returns: - The appropriate slice of "tensor", based on "slice_spec". - As an operator. The operator also has a `assign()` method - that can be used to generate an assignment operator. - - Raises: - ValueError: If a slice range is negative size. - TypeError: TypeError: If the slice indices aren't int, slice, - ellipsis, tf.newaxis or int32/int64 tensors. - - """ - - return _slice_helper(var.value(), slice_spec, var) - - -tensor_lib.Tensor._override_operator("__getitem__", _slice_helper) - - @tf_export("parallel_stack") @dispatch.add_dispatch_support def parallel_stack(values, name="parallel_stack"): diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py index 27f57869511514..2f1247d4226642 100644 --- a/tensorflow/python/ops/cond_v2.py +++ b/tensorflow/python/ops/cond_v2.py @@ -207,8 +207,12 @@ def _is_op_stateful(op): Returns: Boolean indicates whether the operation is stateless or not. """ + # TODO(pineapplejuice233): Remove these hardcode op names once they can be marked as + # stateless in TF. 
if op.type == "GlobalIterId": return False + if op.type == "UpdateFdoWithGlobalMinibatchStatistics": + return False if op.type == "CollectiveGatherV2" and op.get_attr("is_stateless"): return False return op._is_stateful diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index 1bde62c0adbc20..ba4590c385569b 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -16,9 +16,6 @@ import uuid -# TODO(b/280454072) Remove compat and inplace_ops when foward compatibility -# window expires. -from tensorflow.python.compat import compat from tensorflow.python.eager import context from tensorflow.python.eager import def_function @@ -1497,17 +1494,10 @@ def body(i, num_elems, *args): new_out = [] else: update_i = i + 1 if inclusive and not reverse else i - # TODO(b/280454072) Cleanup when foward compatibility window expires. - if compat.forward_compatible(2023, 10, 26): - new_out = [ - gen_array_ops.tensor_scatter_update(x, [[update_i]], [y]) - for x, y in zip(out, flat_accum) - ] - else: - new_out = [ - inplace_ops.alias_inplace_update(x, update_i, y) - for x, y in zip(out, flat_accum) - ] + new_out = [ + gen_array_ops.tensor_scatter_update(x, [[update_i]], [y]) + for x, y in zip(out, flat_accum) + ] i = i - 1 if reverse else i + 1 return [i, num_elems] + new_out + flat_accum @@ -1522,15 +1512,9 @@ def body(i, num_elems, *args): [[num_outputs], array_ops.shape(initial_accum)], 0) out = inplace_ops.empty(out_shape, dtype=initial_accum.dtype, init=True) if inclusive: - # TODO(b/280454072) Cleanup when foward compatibility window expires. - if compat.forward_compatible(2023, 10, 26): - out = gen_array_ops.tensor_scatter_add( - out, [[init_i + (1 if reverse else 0)]], [initial_accum] - ) - else: - out = inplace_ops.alias_inplace_add( - out, init_i + (1 if reverse else 0), initial_accum - ) + out = gen_array_ops.tensor_scatter_add( + out, [[init_i + (1 if reverse else 0)]], [initial_accum] + ) outputs.append(out) loop_in = [init_i, num_elems] + outputs + flat_initial hostmem = [ diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py index ae88a6d6306831..a45b9965078898 100644 --- a/tensorflow/python/ops/gradients_impl.py +++ b/tensorflow/python/ops/gradients_impl.py @@ -24,6 +24,7 @@ from tensorflow.python.ops import cudnn_rnn_grad # pylint: disable=unused-import from tensorflow.python.ops import gradients_util from tensorflow.python.ops import image_grad # pylint: disable=unused-import +from tensorflow.python.ops import io_ops # pylint: disable=unused-import from tensorflow.python.ops import linalg_grad # pylint: disable=unused-import from tensorflow.python.ops import linalg_ops # pylint: disable=unused-import from tensorflow.python.ops import logging_ops # pylint: disable=unused-import diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index b643278e3f9eb2..44085948a48689 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -1254,6 +1254,9 @@ def Grad(*grad): @test_util.enable_quantized_dtypes_training def testCustomGradientQuantizedDtypeTraining(self): + # TODO(b/309175067): Remove below skipTest() when fixed. 
+ if sys.platform == "darwin": + self.skipTest("This test fails in TF MacOS nightly and continuous builds") with context.eager_mode(): @custom_gradient.custom_gradient def F(x): diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index fd8995f6250a81..e476217347bed6 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -2030,7 +2030,8 @@ def random_brightness(image, max_delta, seed=None): Args: image: An image or images to adjust. - max_delta: float, must be non-negative. + max_delta: float, must be non-negative. This parameter controls the maximum + relative change in brightness. seed: A Python integer. Used to create a random seed. See `tf.compat.v1.set_random_seed` for behavior. diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py index a0ef239581405a..0d34a764e5f6fd 100644 --- a/tensorflow/python/ops/init_ops_test.py +++ b/tensorflow/python/ops/init_ops_test.py @@ -172,9 +172,6 @@ def test_Orthogonal(self): self._runner( init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, - skip_message='Disable subtest on ROCm due to missing QR op support') @test_util.run_gpu_only def testVariablePlacementWithOrthogonalInitializer(self): with ops.Graph().as_default() as g: diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py index 591d50d1c089a4..b9a4a32e425481 100644 --- a/tensorflow/python/ops/linalg/linear_operator.py +++ b/tensorflow/python/ops/linalg/linear_operator.py @@ -1656,7 +1656,10 @@ def _matmul( # pylint:disable=missing-docstring a_is_sparse=False, b_is_sparse=False, output_type=None, # pylint: disable=unused-argument - name=None): + grad_a=False, # pylint: disable=unused-argument + grad_b=False, # pylint: disable=unused-argument + name=None, +): if transpose_a or transpose_b: raise ValueError("Transposing not supported at this time.") if a_is_sparse or b_is_sparse: diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py index 3cdbe9ddba43aa..faa71ee0548b80 100644 --- a/tensorflow/python/ops/linalg/linear_operator_test_util.py +++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py @@ -387,9 +387,9 @@ def test_log_abs_det(self: "LinearOperatorDerivedClassTest"): return test_log_abs_det -@test_util.run_without_tensor_float_32("Use FP32 in matmul") def _test_operator_matmul_with_same_type(use_placeholder, shapes_info, dtype): """op_a.matmul(op_b), in the case where the same type is returned.""" + @test_util.run_without_tensor_float_32("Use FP32 in matmul") def test_operator_matmul_with_same_type( self: "LinearOperatorDerivedClassTest"): with self.session(graph=ops.Graph()) as sess: @@ -501,7 +501,6 @@ def _test_matmul_base( self.assertAC(op_matmul_v, mat_matmul_v) -@test_util.run_without_tensor_float_32("Use FP32 in matmul") def _test_matmul( use_placeholder, shapes_info, @@ -509,6 +508,7 @@ def _test_matmul( adjoint, adjoint_arg, blockwise_arg): + @test_util.run_without_tensor_float_32("Use FP32 in matmul") def test_matmul(self: "LinearOperatorDerivedClassTest"): _test_matmul_base( self, @@ -522,7 +522,6 @@ def test_matmul(self: "LinearOperatorDerivedClassTest"): return test_matmul -@test_util.run_without_tensor_float_32("Use FP32 in matmul") def _test_matmul_with_broadcast( use_placeholder, shapes_info, @@ -530,6 +529,7 @@ def 
_test_matmul_with_broadcast( adjoint, adjoint_arg, blockwise_arg): + @test_util.run_without_tensor_float_32("Use FP32 in matmul") def test_matmul_with_broadcast(self: "LinearOperatorDerivedClassTest"): _test_matmul_base( self, @@ -822,8 +822,8 @@ def test_diag_part(self: "LinearOperatorDerivedClassTest"): return test_diag_part -@test_util.run_without_tensor_float_32("Use FP32 in matmul") def _test_composite_tensor(use_placeholder, shapes_info, dtype): + @test_util.run_without_tensor_float_32("Use FP32 in matmul") def test_composite_tensor(self: "LinearOperatorDerivedClassTest"): with self.session(graph=ops.Graph()) as sess: sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED @@ -863,8 +863,8 @@ def body(op): return test_composite_tensor -@test_util.run_without_tensor_float_32("Use FP32 in matmul") def _test_saved_model(use_placeholder, shapes_info, dtype): + @test_util.run_without_tensor_float_32("Use FP32 in matmul") def test_saved_model(self: "LinearOperatorDerivedClassTest"): with self.session(graph=ops.Graph()) as sess: sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index 3a120565a1603b..8fe7047cbd42ef 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -1664,13 +1664,15 @@ def _MatMulGradAgainstFirstOnly(op: ops.Operation, grad): t_b = op.get_attr("transpose_b") b = math_ops.conj(op.inputs[1]) if not t_a and not t_b: - grad_a = gen_math_ops.mat_mul(grad, b, transpose_b=True) + grad_a = gen_math_ops.mat_mul(grad, b, transpose_b=True, grad_a=True) elif not t_a and t_b: - grad_a = gen_math_ops.mat_mul(grad, b) + grad_a = gen_math_ops.mat_mul(grad, b, grad_a=True) elif t_a and not t_b: - grad_a = gen_math_ops.mat_mul(b, grad, transpose_b=True) + grad_a = gen_math_ops.mat_mul(b, grad, transpose_b=True, grad_a=True) elif t_a and t_b: - grad_a = gen_math_ops.mat_mul(b, grad, transpose_a=True, transpose_b=True) + grad_a = gen_math_ops.mat_mul( + b, grad, transpose_a=True, transpose_b=True, grad_a=True + ) return grad_a, None @@ -1680,13 +1682,15 @@ def _MatMulGradAgainstSecondOnly(op: ops.Operation, grad): t_b = op.get_attr("transpose_b") a = math_ops.conj(op.inputs[0]) if not t_a and not t_b: - grad_b = gen_math_ops.mat_mul(a, grad, transpose_a=True) + grad_b = gen_math_ops.mat_mul(a, grad, transpose_a=True, grad_b=True) elif not t_a and t_b: - grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True) + grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True, grad_b=True) elif t_a and not t_b: - grad_b = gen_math_ops.mat_mul(a, grad) + grad_b = gen_math_ops.mat_mul(a, grad, grad_b=True) elif t_a and t_b: - grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True, transpose_b=True) + grad_b = gen_math_ops.mat_mul( + grad, a, transpose_a=True, transpose_b=True, grad_b=True + ) return None, grad_b @@ -1709,17 +1713,21 @@ def _MatMulGrad(op: ops.Operation, grad): a = math_ops.conj(op.inputs[0]) b = math_ops.conj(op.inputs[1]) if not t_a and not t_b: - grad_a = gen_math_ops.mat_mul(grad, b, transpose_b=True) - grad_b = gen_math_ops.mat_mul(a, grad, transpose_a=True) + grad_a = gen_math_ops.mat_mul(grad, b, transpose_b=True, grad_a=True) + grad_b = gen_math_ops.mat_mul(a, grad, transpose_a=True, grad_b=True) elif not t_a and t_b: - grad_a = gen_math_ops.mat_mul(grad, b) - grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True) + grad_a = gen_math_ops.mat_mul(grad, b, grad_a=True) + grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True, grad_b=True) elif t_a and not t_b: - 
grad_a = gen_math_ops.mat_mul(b, grad, transpose_b=True) - grad_b = gen_math_ops.mat_mul(a, grad) + grad_a = gen_math_ops.mat_mul(b, grad, transpose_b=True, grad_a=True) + grad_b = gen_math_ops.mat_mul(a, grad, grad_b=True) elif t_a and t_b: - grad_a = gen_math_ops.mat_mul(b, grad, transpose_a=True, transpose_b=True) - grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True, transpose_b=True) + grad_a = gen_math_ops.mat_mul( + b, grad, transpose_a=True, transpose_b=True, grad_a=True + ) + grad_b = gen_math_ops.mat_mul( + grad, a, transpose_a=True, transpose_b=True, grad_b=True + ) return grad_a, grad_b @@ -1833,18 +1841,34 @@ def _BatchMatMulV2(op: ops.Operation, grad): if not adj_x: if not adj_y: - grad_x = math_ops.matmul(grad, y, adjoint_a=False, adjoint_b=True) - grad_y = math_ops.matmul(x, grad, adjoint_a=True, adjoint_b=False) + grad_x = math_ops.matmul( + grad, y, adjoint_a=False, adjoint_b=True, grad_a=True + ) + grad_y = math_ops.matmul( + x, grad, adjoint_a=True, adjoint_b=False, grad_b=True + ) else: - grad_x = math_ops.matmul(grad, y, adjoint_a=False, adjoint_b=False) - grad_y = math_ops.matmul(grad, x, adjoint_a=True, adjoint_b=False) + grad_x = math_ops.matmul( + grad, y, adjoint_a=False, adjoint_b=False, grad_a=True + ) + grad_y = math_ops.matmul( + grad, x, adjoint_a=True, adjoint_b=False, grad_b=True + ) else: if not adj_y: - grad_x = math_ops.matmul(y, grad, adjoint_a=False, adjoint_b=True) - grad_y = math_ops.matmul(x, grad, adjoint_a=False, adjoint_b=False) + grad_x = math_ops.matmul( + y, grad, adjoint_a=False, adjoint_b=True, grad_a=True + ) + grad_y = math_ops.matmul( + x, grad, adjoint_a=False, adjoint_b=False, grad_b=True + ) else: - grad_x = math_ops.matmul(y, grad, adjoint_a=True, adjoint_b=True) - grad_y = math_ops.matmul(grad, x, adjoint_a=True, adjoint_b=True) + grad_x = math_ops.matmul( + y, grad, adjoint_a=True, adjoint_b=True, grad_a=True + ) + grad_y = math_ops.matmul( + grad, x, adjoint_a=True, adjoint_b=True, grad_b=True + ) # Possibly reduce along the broadcasted batch dimensions, if broadcasting # is required. 
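The `grad_a`/`grad_b` (and `grad_x`/`grad_y`) attributes threaded through the gradient functions above are only hints telling the kernel which operand is a backpropagated gradient; the gradient math itself is unchanged. As a minimal sketch (assuming a TensorFlow 2.x install; the tensor shapes are arbitrary), the identities implemented for the non-transposed case can be checked against `tf.GradientTape`:

import tensorflow as tf

a = tf.random.normal([3, 4])
b = tf.random.normal([4, 5])
upstream = tf.random.normal([3, 5])  # gradient arriving at the MatMul output

with tf.GradientTape() as tape:
  tape.watch([a, b])
  y = tf.matmul(a, b)

# Gradients computed through the registered MatMul gradient function.
grad_a, grad_b = tape.gradient(y, [a, b], output_gradients=upstream)

# Closed-form identities for the non-transposed case:
#   dL/da = dL/dy @ b^T     dL/db = a^T @ dL/dy
tf.debugging.assert_near(grad_a, tf.matmul(upstream, b, transpose_b=True))
tf.debugging.assert_near(grad_b, tf.matmul(a, upstream, transpose_a=True))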
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 7645eaaae39311..29c695c0da2a40 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -68,7 +68,6 @@ API docstring: tensorflow.math """ import builtins -import numbers import numpy as np from tensorflow.python.eager import context @@ -76,6 +75,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import indexed_slices from tensorflow.python.framework import ops +from tensorflow.python.framework import override_binary_operator from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor as tensor_lib from tensorflow.python.framework import tensor_conversion_registry @@ -89,18 +89,17 @@ from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gen_sparse_ops +from tensorflow.python.ops import tensor_math_operator_overrides # pylint: disable=unused-import # go/tf-wildcard-import # pylint: disable=wildcard-import from tensorflow.python.ops.gen_math_ops import * # pylint: enable=wildcard-import -from tensorflow.python.ops.numpy_ops import np_dtypes from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import compat from tensorflow.python.util import deprecation from tensorflow.python.util import dispatch from tensorflow.python.util import nest -from tensorflow.python.util import tf_decorator -from tensorflow.python.util import traceback_utils from tensorflow.python.util.compat import collections_abc from tensorflow.python.util.tf_export import tf_export @@ -233,11 +232,6 @@ def linspace_nd(start, stop, num, name=None, axis=0): tf_export(v1=["arg_min"])(dispatch.add_dispatch_support(arg_min)) -# This is set by resource_variable_ops.py. It is included in this way since -# there is a circular dependency between math_ops and resource_variable_ops -_resource_variable_type = None - - def _set_doc(doc): def _decorator(func): @@ -997,8 +991,9 @@ def cast(x, dtype, name=None): """ base_type = dtypes.as_dtype(dtype).base_dtype - if isinstance( - x, (tensor_lib.Tensor, _resource_variable_type)) and base_type == x.dtype: + if ( + isinstance(x, tensor_lib.Tensor) or _pywrap_utils.IsResourceVariable(x) + ) and base_type == x.dtype: return x with ops.name_scope(name, "Cast", [x]) as name: if isinstance(x, sparse_tensor.SparseTensor): @@ -1388,150 +1383,6 @@ def to_complex128(x, name="ToComplex128"): return cast(x, dtypes.complex128, name=name) -tensor_lib.Tensor._override_operator("__neg__", gen_math_ops.neg) -tensor_lib.Tensor._override_operator("__abs__", abs) - - -def _maybe_get_dtype(x): - """Returns a numpy type if available from x. Skips if x is numpy.ndarray.""" - # Don't put np.ndarray in this list, because np.result_type looks at the - # value (not just dtype) of np.ndarray to decide the result type. - if isinstance(x, numbers.Real): - return x - if isinstance(x, tensor_lib.Tensor): - return x.dtype.as_numpy_dtype - if isinstance(x, dtypes.DType): - return x.as_numpy_dtype - if isinstance(x, tensor_shape.TensorShape): - return np.int32 - if isinstance(x, (list, tuple)): - raise ValueError(f"Cannot determine dtype. Got sequence {x}.") - return x - - -def maybe_promote_tensors(*tensors, force_same_dtype=False): - """Promotes tensors if numpy style promotion is enabled. 
- - This function promotes `tensors` according to numpy promotion rules - if numpy style promotion is enabled. Otherwise, if - `force_same_dtype` is `True`, it force-casts `tensors[1:]` to - `tensor[0]`'s dtype. Note that this force-cast can be problematic. - For example, when some `tensors[1:]` elements can be silently - downcasted. - - Args: - *tensors: the list of tensors to promote. - force_same_dtype: bool (optional, default to `False`). When numpy - style promotion is disabled and `force_same_dtype` is `True`, - this function will force-casts `tensors[1:]` to `tensor[0]`'s - dtype (which could be problematic). - - Returns: - The promoted list of tensors. - """ - if ops.is_auto_dtype_conversion_enabled(): - return tensors - if not tensors: - return tensors - if not ops.is_numpy_style_type_promotion(): - if not force_same_dtype: - return tensors - promoted_tensors = [] - promoted_tensors.append(tensors[0]) - dtype = tensors[0].dtype.base_dtype - for tensor in tensors[1:]: - promoted_tensors.append( - ops.convert_to_tensor(tensor, dtype, name="x")) - return promoted_tensors - result_type = np_dtypes._result_type( - *[_maybe_get_dtype(x) for x in nest.flatten(tensors)]) - def _promote_or_cast(x): - if isinstance(x, tensor_lib.Tensor): - x = cast(x, result_type) - else: - x = ops.convert_to_tensor(x, result_type) - return x - return [_promote_or_cast(x) for x in tensors] - - -def _OverrideBinaryOperatorHelper( - func, op_name, clazz_object=tensor_lib.Tensor): - """Register operators with different tensor and scalar versions. - - If `clazz_object` is `SparseTensor`, assumes `func` takes `(sp_indices, - sp_values, sp_shape, dense)` and outputs `(new_sp_values)`. - - Args: - func: the operator - op_name: name of the operator being overridden - clazz_object: class to override for. Either `Tensor` or `SparseTensor`. - """ - - @traceback_utils.filter_traceback - def binary_op_wrapper(x, y): - with ops.name_scope(None, op_name, [x, y]) as name: - try: - # force_same_dtype=False to preserve existing TF behavior - # TODO(b/178860388): Figure out why binary_op_wrapper and - # r_binary_op_wrapper use different force_same_dtype values. - x, y = maybe_promote_tensors(x, y) - return func(x, y, name=name) - except (TypeError, ValueError) as e: - # Even if dispatching the op failed, the RHS may be a tensor aware - # object that can implement the operator with knowledge of itself - # and the tensor. - # If the RHS is not tensor aware we still want to raise the - # original error from the LHS, because it may be more - # informative. - if hasattr(type(y), "__r%s__" % op_name): - try: - r_op = getattr(y, "__r%s__" % op_name) - out = r_op(x) - if out is NotImplemented: - raise - return out - except (TypeError, ValueError): - raise e - else: - raise - - @traceback_utils.filter_traceback - def binary_op_wrapper_sparse(sp_x, y): - with ops.name_scope(None, op_name, [sp_x, y]) as name: - y = ops.convert_to_tensor(y, dtype=sp_x.dtype.base_dtype, name="y") - return sparse_tensor.SparseTensor( - sp_x.indices, - func(sp_x.indices, sp_x.values, sp_x.dense_shape, y, name=name), - sp_x.dense_shape) - - @traceback_utils.filter_traceback - def r_binary_op_wrapper(y, x): - with ops.name_scope(None, op_name, [x, y]) as name: - # TODO(b/178860388): Figure out why binary_op_wrapper and - # r_binary_op_wrapper use different force_same_dtype values. 
- y, x = maybe_promote_tensors(y, x, force_same_dtype=True) - return func(x, y, name=name) - - # Propagate func.__doc__ to the wrappers - try: - doc = func.__doc__ - except AttributeError: - doc = None - binary_op_wrapper.__doc__ = doc - r_binary_op_wrapper.__doc__ = doc - binary_op_wrapper_sparse.__doc__ = doc - - if clazz_object is tensor_lib.Tensor: - clazz_object._override_operator("__%s__" % op_name, binary_op_wrapper) - del binary_op_wrapper - clazz_object._override_operator("__r%s__" % op_name, r_binary_op_wrapper) - del r_binary_op_wrapper - else: - clazz_object._override_operator("__%s__" % op_name, - binary_op_wrapper_sparse) - del binary_op_wrapper_sparse - - # Conversion table for __truediv__. None entries mean no conversion required. _TRUEDIV_TABLE = { dtypes.uint8: dtypes.float32, @@ -1551,33 +1402,6 @@ def r_binary_op_wrapper(y, x): } -# NOTE: the support of "sparse (true)div dense" is currently not baked in into -# "tf.(true_)div()". Until such an API decision is made, the supported usage is -# to explicitly use the "/" operator to invoke either truediv or div. -def _sparse_dense_truediv(sp_indices, sp_values, sp_shape, y, name=None): - """Internal helper function for 'sp_t / dense_t'.""" - with ops.name_scope(name, "truediv", - [sp_indices, sp_values, sp_shape, y]) as name: - sp_values = ops.convert_to_tensor(sp_values, name="sp_values") - y = ops.convert_to_tensor(y, name="y") - x_dtype = sp_values.dtype.base_dtype - y_dtype = y.dtype.base_dtype - if x_dtype != y_dtype: - raise TypeError(f"`x` and `y` must have the same dtype, " - f"got {x_dtype!r} != {y_dtype!r}.") - try: - dtype = _TRUEDIV_TABLE[x_dtype] - except KeyError: - raise TypeError( - f"Invalid dtype {x_dtype!r} in __truediv__. Expected one " - f"of {{{', '.join([repr(x) for x in _TRUEDIV_TABLE.keys()])}}}.") - if dtype is not None: - sp_values = cast(sp_values, dtype) - y = cast(y, dtype) - return gen_sparse_ops.sparse_dense_cwise_div( - sp_indices, sp_values, sp_shape, y, name=name) - - def _truediv_python3(x, y, name=None): with ops.name_scope(name, "truediv", [x, y]) as name: x = ops.convert_to_tensor(x, name="x") @@ -1881,26 +1705,6 @@ def _mul_dispatch(x, y, name=None): return multiply(x, y, name=name) -# NOTE(aselle): When integer division is added for sparse_dense_cwise, -# div, truediv, and floordiv should be delegated appropriately for -# Python semantics, analogous to dense cwise tensor operations. 
-_OverrideBinaryOperatorHelper(gen_sparse_ops.sparse_dense_cwise_div, "div", - sparse_tensor.SparseTensor) -_OverrideBinaryOperatorHelper(_sparse_dense_truediv, "truediv", - sparse_tensor.SparseTensor) -_OverrideBinaryOperatorHelper(gen_sparse_ops.sparse_dense_cwise_mul, "mul", - sparse_tensor.SparseTensor) - -_OverrideBinaryOperatorHelper(_add_dispatch, "add") -_OverrideBinaryOperatorHelper(subtract, "sub") -_OverrideBinaryOperatorHelper(_mul_dispatch, "mul") -_OverrideBinaryOperatorHelper(div, "div") -_OverrideBinaryOperatorHelper(truediv, "truediv") -_OverrideBinaryOperatorHelper(floordiv, "floordiv") -_OverrideBinaryOperatorHelper(mod, "mod") -_OverrideBinaryOperatorHelper(pow, "pow") - - @tf_export("math.logical_xor", v1=["math.logical_xor", "logical_xor"]) @dispatch.register_binary_elementwise_api @dispatch.add_dispatch_support @@ -1977,29 +1781,6 @@ def invert_(x, name=None): return gen_bitwise_ops.invert(x, name=name) -_OverrideBinaryOperatorHelper(and_, "and") -_OverrideBinaryOperatorHelper(or_, "or") -_OverrideBinaryOperatorHelper(xor_, "xor") -tensor_lib.Tensor._override_operator("__invert__", invert_) - - -def _promote_dtypes_decorator(fn): - def wrapper(x, y, *args, **kwargs): - x, y = maybe_promote_tensors(x, y) - return fn(x, y, *args, **kwargs) - return tf_decorator.make_decorator(fn, wrapper) - - -tensor_lib.Tensor._override_operator("__lt__", _promote_dtypes_decorator( - gen_math_ops.less)) -tensor_lib.Tensor._override_operator("__le__", _promote_dtypes_decorator( - gen_math_ops.less_equal)) -tensor_lib.Tensor._override_operator("__gt__", _promote_dtypes_decorator( - gen_math_ops.greater)) -tensor_lib.Tensor._override_operator("__ge__", _promote_dtypes_decorator( - gen_math_ops.greater_equal)) - - @tf_export("math.equal", "equal") @dispatch.register_binary_elementwise_api @dispatch.add_dispatch_support @@ -2109,7 +1890,7 @@ def tensor_equals(self, other): and ops.executing_eagerly_outside_functions() and (g is None or g.building_function) ): - self, other = maybe_promote_tensors(self, other) + self, other = override_binary_operator.maybe_promote_tensors(self, other) return gen_math_ops.equal(self, other, incompatible_shape_error=False) else: # In legacy graph mode, tensor equality is object equality @@ -2149,17 +1930,13 @@ def tensor_not_equals(self, other): tensor_lib.Tensor._USE_EQUALITY and ops.executing_eagerly_outside_functions() ): - self, other = maybe_promote_tensors(self, other) + self, other = override_binary_operator.maybe_promote_tensors(self, other) return gen_math_ops.not_equal(self, other, incompatible_shape_error=False) else: # In legacy graph mode, tensor equality is object equality return self is not other -tensor_lib.Tensor._override_operator("__eq__", tensor_equals) -tensor_lib.Tensor._override_operator("__ne__", tensor_not_equals) - - @tf_export("range") @dispatch.add_dispatch_support def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disable=redefined-builtin @@ -3616,16 +3393,20 @@ def trace(x, name=None): @tf_export("linalg.matmul", "matmul") @dispatch.add_dispatch_support -def matmul(a, - b, - transpose_a=False, - transpose_b=False, - adjoint_a=False, - adjoint_b=False, - a_is_sparse=False, - b_is_sparse=False, - output_type=None, - name=None): +def matmul( + a, + b, + transpose_a=False, + transpose_b=False, + adjoint_a=False, + adjoint_b=False, + a_is_sparse=False, + b_is_sparse=False, + output_type=None, + grad_a=False, + grad_b=False, + name=None, +): """Multiplies matrix `a` by matrix `b`, producing `a` * `b`. 
The inputs must, following any transpositions, be tensors of rank >= 2 @@ -3711,17 +3492,19 @@ def matmul(a, multiplication. a_is_sparse: If `True`, `a` is treated as a sparse matrix. Notice, this **does not support `tf.sparse.SparseTensor`**, it just makes optimizations - that assume most values in `a` are zero. - See `tf.sparse.sparse_dense_matmul` - for some support for `tf.sparse.SparseTensor` multiplication. + that assume most values in `a` are zero. See + `tf.sparse.sparse_dense_matmul` for some support for + `tf.sparse.SparseTensor` multiplication. b_is_sparse: If `True`, `b` is treated as a sparse matrix. Notice, this **does not support `tf.sparse.SparseTensor`**, it just makes optimizations - that assume most values in `b` are zero. - See `tf.sparse.sparse_dense_matmul` - for some support for `tf.sparse.SparseTensor` multiplication. + that assume most values in `b` are zero. See + `tf.sparse.sparse_dense_matmul` for some support for + `tf.sparse.SparseTensor` multiplication. output_type: The output datatype if needed. Defaults to None in which case the output_type is the same as input type. Currently only works when input tensors are type (u)int8 and output_type can be int32. + grad_a: Set it to `True` to hint that Tensor `a` is for the backward pass. + grad_b: Set it to `True` to hint that Tensor `b` is for the backward pass. name: Name for the operation (optional). Returns: @@ -3755,9 +3538,12 @@ def matmul(a, f"`adjoint_b`={adjoint_b}.") if context.executing_eagerly(): - if not isinstance(a, (ops.EagerTensor, _resource_variable_type)): + if not ( + isinstance(a, ops.EagerTensor) or _pywrap_utils.IsResourceVariable(a) + ): a = ops.convert_to_tensor(a, name="a") - if not isinstance(b, (ops.EagerTensor, _resource_variable_type)): + if not isinstance(b, ops.EagerTensor) or _pywrap_utils.IsResourceVariable( + b): b = ops.convert_to_tensor(b, dtype_hint=a.dtype.base_dtype, name="b") else: a = ops.convert_to_tensor(a, name="a") @@ -3790,10 +3576,25 @@ def matmul(a, adjoint_b = True if use_batch_matmul_v3: return gen_math_ops.batch_mat_mul_v3( - a, b, adj_x=adjoint_a, adj_y=adjoint_b, Tout=output_type, name=name) + a, + b, + adj_x=adjoint_a, + adj_y=adjoint_b, + Tout=output_type, + grad_x=grad_a, + grad_y=grad_b, + name=name, + ) else: return gen_math_ops.batch_mat_mul_v2( - a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name) + a, + b, + adj_x=adjoint_a, + adj_y=adjoint_b, + grad_x=grad_a, + grad_y=grad_b, + name=name, + ) # Neither matmul nor sparse_matmul support adjoint, so we conjugate # the matrix and use transpose instead. 
Conj() is a noop for real @@ -3837,10 +3638,25 @@ def matmul(a, adjoint_a = adjoint_a or transpose_a adjoint_b = adjoint_b or transpose_b return gen_math_ops.batch_mat_mul_v3( - a, b, adj_x=adjoint_a, adj_y=adjoint_b, Tout=output_type, name=name) + a, + b, + adj_x=adjoint_a, + adj_y=adjoint_b, + Tout=output_type, + grad_x=grad_a, + grad_y=grad_b, + name=name, + ) else: return gen_math_ops.mat_mul( - a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name) + a, + b, + transpose_a=transpose_a, + transpose_b=transpose_b, + grad_a=grad_a, + grad_b=grad_b, + name=name, + ) @tf_export("linalg.matvec") @@ -3884,7 +3700,7 @@ def matvec(a, b = tf.constant([7, 9, 11], shape=[3]) # `a` * `b` - # [ 58, 64] + # [ 58, 139] c = tf.linalg.matvec(a, b) @@ -3950,7 +3766,6 @@ def matmul_wrapper(a, b, name=None): # pylint: disable=missing-function-docstri return a._matmul(b) return matmul(a, b, name=name) matmul_wrapper.__doc__ = matmul.__doc__ -_OverrideBinaryOperatorHelper(matmul_wrapper, "matmul") sparse_matmul = deprecation.deprecated(None, "Use `tf.linalg.matmul` instead")( gen_math_ops.sparse_mat_mul) diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py index 1131ec377fac18..e4599cbb83a5eb 100644 --- a/tensorflow/python/ops/nn_fused_batchnorm_test.py +++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py @@ -406,7 +406,7 @@ def _runtests(self, x_shape, is_training, gradient_test=False, else: data_format_list = ['NCDHW', 'NDHWC'] use_gpu_vals = [False] - if test.is_gpu_available(cuda_only=True) and not cpu_only: + if test.is_gpu_available() and not cpu_only: use_gpu_vals += [True] factors = [1.0, 0.6] for dtype in [np.float16, np.float32, dtypes.bfloat16.as_numpy_dtype]: @@ -594,7 +594,7 @@ def _testBatchNormGradGrad(self, config): data_format_nhwc, features_nhwc = 'NDHWC', shape[4] data_format_nchw, features_nchw = 'NCDHW', shape[1] for is_training in [True, False]: - if test.is_gpu_available(cuda_only=True): + if test.is_gpu_available(): self._test_grad_grad( shape, dtype, [features_nhwc], diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index cee25369963fda..e62eb4c075fcf1 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -2773,9 +2773,6 @@ def loop_fn(i): (fft_ops.rfft2d,), (fft_ops.rfft3d,), ) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, - skip_message="Disable subtest on ROCm due to rocfft issues") def test_rfft(self, op_func): for dtype in (dtypes.float32, dtypes.float64): x = random_ops.random_uniform([2, 3, 4, 3, 4], dtype=dtype) @@ -2794,9 +2791,6 @@ def loop_fn(i): (fft_ops.irfft2d,), (fft_ops.irfft3d,), ) - @test.disable_with_predicate( - pred=test.is_built_with_rocm, - skip_message="Disable subtest on ROCm due to rocfft issues") def test_irfft(self, op_func): if config.list_physical_devices("GPU"): # TODO(b/149957923): The test is flaky diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py index c9d9a79dad753f..457e54641c6953 100644 --- a/tensorflow/python/ops/ragged/__init__.py +++ b/tensorflow/python/ops/ragged/__init__.py @@ -25,3 +25,4 @@ API docstring: tensorflow.ragged """ +from tensorflow.python.ops.ragged import ragged_tensor diff --git a/tensorflow/python/ops/ragged/ragged_cross_op_test.py b/tensorflow/python/ops/ragged/ragged_cross_op_test.py index 
c098c13644f342..ce3dc913f35e3d 100644 --- a/tensorflow/python/ops/ragged/ragged_cross_op_test.py +++ b/tensorflow/python/ops/ragged/ragged_cross_op_test.py @@ -475,7 +475,7 @@ def testRaggedValuesAndSplitsMustMatch(self): def testRaggedCrossInvalidRaggedSplits(self, ragged_splits): # Test case in GitHub isseu 59114. with self.assertRaisesRegex( - (ValueError, errors.InvalidArgumentError), 'Invalid RaggedTensor' + (ValueError, errors.InvalidArgumentError), 'Invalid ragged splits' ): ragged_values_0_tensor = ops.convert_to_tensor(np.ones([3], dtype=str)) ragged_values_0 = array_ops.identity(ragged_values_0_tensor) diff --git a/tensorflow/python/ops/ragged/ragged_getitem_test.py b/tensorflow/python/ops/ragged/ragged_getitem_test.py index f707f9f5620e2e..9c46fb0c9c771e 100644 --- a/tensorflow/python/ops/ragged/ragged_getitem_test.py +++ b/tensorflow/python/ops/ragged/ragged_getitem_test.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import tensor_getitem_override from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor @@ -289,14 +290,16 @@ def testWithStridedSlices(self, start, stop): 'Cannot index into an inner ragged dimension'), # Tests for type errors - (SLICE_BUILDER[0.5], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)), + (SLICE_BUILDER[0.5], TypeError, re.escape( + tensor_getitem_override._SLICE_TYPE_ERROR)), (SLICE_BUILDER[1:3:0.5], TypeError, re.escape( - array_ops._SLICE_TYPE_ERROR)), + tensor_getitem_override._SLICE_TYPE_ERROR)), (SLICE_BUILDER[:, 1:3:0.5], TypeError, 'slice strides must be integers or None'), (SLICE_BUILDER[:, 0.5:1.5], TypeError, 'slice offsets must be integers or None'), - (SLICE_BUILDER['foo'], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)), + (SLICE_BUILDER['foo'], TypeError, re.escape( + tensor_getitem_override._SLICE_TYPE_ERROR)), (SLICE_BUILDER[:, 'foo':'foo'], TypeError, 'slice offsets must be integers or None'), diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py index ef98fb344aed2d..fac49983845728 100644 --- a/tensorflow/python/ops/ragged/ragged_math_ops.py +++ b/tensorflow/python/ops/ragged/ragged_math_ops.py @@ -37,9 +37,9 @@ from tensorflow.python.util.tf_export import tf_export -#=============================================================================== +# =============================================================================== # ragged.range -#=============================================================================== +# =============================================================================== # pylint: disable=redefined-builtin @tf_export('ragged.range') @dispatch.add_dispatch_support @@ -124,9 +124,9 @@ def _infer_matching_dtype(tensors, dtype_hierarchy): ops.no_gradient('RaggedRange') -#=============================================================================== +# =============================================================================== # ragged_segment_ -#=============================================================================== +# =============================================================================== # Docstring template used for the raggged_segment_ ops. 
_RAGGED_SEGMENT_DOCSTRING = """\ @@ -374,9 +374,9 @@ def _set_ragged_segment_docstring(func, combination, combined): _set_ragged_segment_docstring(segment_sqrt_n, 'sum divided by sqrt(N)', 'summed') -#=============================================================================== +# =============================================================================== # ragged_reduce_ -#=============================================================================== +# =============================================================================== # Docstring template used for ragged_reduce_ ops. _RAGGED_REDUCE_DOCSTRING = """\ @@ -707,7 +707,8 @@ def reduce_variance(input_tensor: ragged_tensor.Ragged, input_tensor, name='input_tensor') if input_tensor.dtype.is_complex: raise ValueError( - 'reduce_variance is not supported for RaggedTensors with complex dtypes.' + 'reduce_variance is not supported for RaggedTensors with complex' + ' dtypes.' ) square_of_input = math_ops.square(input_tensor) mean_of_square = reduce_mean(square_of_input, axis=axis, keepdims=keepdims) @@ -788,20 +789,24 @@ def _set_ragged_reduce_docstring(func, combination, combined, default, example): _RAGGED_REDUCE_ANY_EXAMPLE) -#=============================================================================== +# =============================================================================== # ragged.matmul -#=============================================================================== +# =============================================================================== @dispatch.dispatch_for_api(math_ops.matmul) -def matmul(a: ragged_tensor.RaggedOrDense, - b: ragged_tensor.RaggedOrDense, - transpose_a=False, - transpose_b=False, - adjoint_a=False, - adjoint_b=False, - a_is_sparse=False, - b_is_sparse=False, - output_type=None, - name=None): +def matmul( + a: ragged_tensor.RaggedOrDense, + b: ragged_tensor.RaggedOrDense, + transpose_a=False, + transpose_b=False, + adjoint_a=False, + adjoint_b=False, + a_is_sparse=False, + b_is_sparse=False, + output_type=None, + grad_a=False, + grad_b=False, + name=None, +): """Multiplies matrix `a` by matrix `b`. If all transpose or adjoint attributes are `False` then: @@ -824,6 +829,8 @@ def matmul(a: ragged_tensor.RaggedOrDense, a_is_sparse: If `True`, optimize assuming `a` is mostly zero. b_is_sparse: If `True`, optimize assuming `b` is mostly zero. output_type: The output datatype (optional). + grad_a: Unused. + grad_b: Unused. name: Name for the operation (optional). Returns: @@ -831,6 +838,8 @@ def matmul(a: ragged_tensor.RaggedOrDense, each inner-most matrix is the product of the corresponding matrices in `a` and `b`. """ + del grad_a + del grad_b if transpose_a and adjoint_a: raise ValueError('Only one of transpose_a and adjoint_a can be True.') if transpose_b and adjoint_b: @@ -1029,9 +1038,9 @@ def _matmul_3d_with_batch_dim_folding(a, b, **kwargs): return a.with_values(array_ops.squeeze(flat_result, axis=1)) -#=============================================================================== +# =============================================================================== # ragged.softmax -#=============================================================================== +# =============================================================================== @dispatch.dispatch_for_api(nn_ops.softmax_v2) def softmax(logits: ragged_tensor.Ragged, axis=None, name=None): """Computes softmax activations. 
@@ -1076,9 +1085,9 @@ def softmax(logits: ragged_tensor.Ragged, axis=None, name=None): return math_ops.divide(logits_exp, denominator) -#=============================================================================== +# =============================================================================== # ragged.add_n -#=============================================================================== +# =============================================================================== @dispatch.dispatch_for_api(math_ops.add_n) def add_n(inputs: typing.List[ragged_tensor.RaggedOrDense], name=None): """RaggedTensor implementation for tf.math.add_n.""" @@ -1088,9 +1097,9 @@ def add_n(inputs: typing.List[ragged_tensor.RaggedOrDense], name=None): return ragged_functional_ops.map_flat_values(math_ops.add_n, inputs) -#=============================================================================== +# =============================================================================== # Ragged version of nn_ops.dropout -#=============================================================================== +# =============================================================================== @dispatch.dispatch_for_api(nn_ops.dropout) def dropout_v1(x: ragged_tensor.Ragged, keep_prob=None, @@ -1140,9 +1149,9 @@ def stateless_dropout(x: ragged_tensor.Ragged, x.flat_values, rate=rate, seed=seed, rng_alg=rng_alg)) -#=============================================================================== +# =============================================================================== # Ragged version of Tensor.__eq__ and Tensor.__ne__ -#=============================================================================== +# =============================================================================== @dispatch.dispatch_for_api(math_ops.tensor_equals) def tensor_equals(self: ragged_tensor.RaggedOrDense, other: ragged_tensor.RaggedOrDense): diff --git a/tensorflow/python/ops/ref_variable.py b/tensorflow/python/ops/ref_variable.py index 7e51288b48ef9d..241275b44da30f 100644 --- a/tensorflow/python/ops/ref_variable.py +++ b/tensorflow/python/ops/ref_variable.py @@ -97,9 +97,6 @@ def default_variable_creator(next_creator=None, **kwargs): shape=shape) -variable_v1.default_variable_creator = default_variable_creator - - def _to_proto_fn(v, export_scope=None): """Converts Variable and ResourceVariable to VariableDef for collections.""" return v.to_proto(export_scope=export_scope) @@ -1346,6 +1343,3 @@ def _restore_from_tensors(self, restored_tensors): # allowing instances of the class to be used as tensors. 
tensor_conversion_registry.register_tensor_conversion_function( RefVariable, RefVariable._TensorConversionFunction) # pylint: disable=protected-access - - -variable_v1.set_variable_from_proto_fn(RefVariable) diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 6e1a6b6280b10a..bc5011178cef7c 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -49,7 +49,6 @@ from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import gen_state_ops from tensorflow.python.ops import handle_data_util -from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import variables # go/tf-wildcard-import @@ -59,7 +58,6 @@ from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.trackable import base as trackable from tensorflow.python.types import core -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import compat from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.tf_export import tf_export @@ -372,9 +370,6 @@ def default_variable_creator_v2(next_creator=None, **kwargs): ) -variables.default_variable_creator_v2 = default_variable_creator_v2 - - class BaseResourceVariable(variables.Variable, core.Tensor): """A python variable from an existing handle.""" @@ -2332,10 +2327,6 @@ def __init__( # pylint: disable=super-init-not-called in_graph_mode=self._in_graph_mode, **unused_kwargs) -_pywrap_utils.RegisterType("ResourceVariable", ResourceVariable) -math_ops._resource_variable_type = ResourceVariable # pylint: disable=protected-access - - def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False): return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref) # pylint: disable=protected-access @@ -2772,9 +2763,6 @@ def __eq__(self, other): ) -_pywrap_utils.RegisterType("VariableSpec", VariableSpec) - - def write_object_proto_for_resource_variable(resource_variable, proto, options, diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py index fcfa8a8b18c260..761f42885ada59 100644 --- a/tensorflow/python/ops/summary_ops_v2.py +++ b/tensorflow/python/ops/summary_ops_v2.py @@ -28,7 +28,6 @@ from tensorflow.dtensor.python import api as dtensor_api from tensorflow.dtensor.python import layout as layout_lib from tensorflow.python.eager import context -from tensorflow.python.eager import profiler as _profiler from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -43,6 +42,7 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import summary_op_util from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.profiler import profiler_v2 as _profiler from tensorflow.python.trackable import resource from tensorflow.python.training import training_util from tensorflow.python.util import deprecation @@ -1327,7 +1327,7 @@ def run_metadata_graphs(name, data, step=None): @tf_export("summary.trace_on", v1=[]) -def trace_on(graph=True, profiler=False): # pylint: disable=redefined-outer-name +def trace_on(graph=True, profiler=False, profiler_outdir=None): # pylint: disable=redefined-outer-name """Starts a trace to record computation graphs and profiling information. Must be invoked in eager mode. 
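The change above adds a `profiler_outdir` argument to `tf.summary.trace_on()`; the hunks that follow route profiler start/stop through it and turn the `profiler_outdir` argument of `trace_export()` into a no-op. A rough usage sketch under the new signature (the log directories and step value are illustrative placeholders, not part of this change):

import tensorflow as tf

writer = tf.summary.create_file_writer("/tmp/tf_logs")  # placeholder logdir

@tf.function
def square(x):
  return x * x

# The profiler output directory is now given to trace_on() rather than
# trace_export().
tf.summary.trace_on(graph=True, profiler=True, profiler_outdir="/tmp/tf_profile")
square(tf.constant(2.0))
with writer.as_default():
  tf.summary.trace_export(name="square_trace", step=0)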
@@ -1342,12 +1342,13 @@ def trace_on(graph=True, profiler=False): # pylint: disable=redefined-outer-nam Args: graph: If True, enables collection of executed graphs. It includes ones from - tf.function invocation and ones from the legacy graph mode. The default - is True. + tf.function invocation and ones from the legacy graph mode. The default is + True. profiler: If True, enables the advanced profiler. Enabling profiler - implicitly enables the graph collection. The profiler may incur a high - memory overhead. The default is False. - + implicitly enables the graph collection. The profiler may incur a high + memory overhead. The default is False. + profiler_outdir: Output directory for the profiler. It is required when + `profiler` is True, and it is ignored otherwise. """ if ops.inside_function(): logging.warn("Cannot enable trace inside a tf.function.") @@ -1365,12 +1366,22 @@ def trace_on(graph=True, profiler=False): # pylint: disable=redefined-outer-nam if graph and not profiler: context.context().enable_graph_collection() if profiler: - context.context().enable_run_metadata() - _profiler.start() + if profiler_outdir is None: + # TODO(b/149431324): Change this to throw a ValueError when TensorFlow + # major version advances. (current version is 2.15) + logging.warn( + "No `profiler_outdir` passed to trace_on(). Profiler won't be" + " enabled." + ) + else: + context.context().enable_run_metadata() + _profiler.start(profiler_outdir) _current_trace_context = _TraceContext(graph=graph, profiler=profiler) +# TODO(b/149431324): Delete `profiler_outdir` arg when TensorFlow major version +# advances. (current version is 2.15) @tf_export("summary.trace_export", v1=[]) def trace_export(name, step=None, profiler_outdir=None): """Stops and exports the active trace as a Summary and/or profile file. @@ -1383,8 +1394,7 @@ def trace_export(name, step=None, profiler_outdir=None): step: Explicit `int64`-castable monotonic step value for this summary. If omitted, this defaults to `tf.summary.experimental.get_step()`, which must not be None. - profiler_outdir: Output directory for profiler. It is required when profiler - is enabled when trace was started. Otherwise, it is ignored. + profiler_outdir: This arg is a no-op. Please set this in trace_on(). Raises: ValueError: if a default writer exists, but no step was provided and @@ -1406,8 +1416,6 @@ def trace_export(name, step=None, profiler_outdir=None): raise ValueError("Must enable trace before export through " "tf.summary.trace_on.") graph, profiler = _current_trace_context # pylint: disable=redefined-outer-name - if profiler and profiler_outdir is None: - raise ValueError("Argument `profiler_outdir` is not specified.") run_meta = context.context().export_run_metadata() @@ -1417,7 +1425,12 @@ def trace_export(name, step=None, profiler_outdir=None): run_metadata(name, run_meta, step) if profiler: - _profiler.save(profiler_outdir, _profiler.stop()) + if profiler_outdir: + logging.warn( + "Ignoring `profiler_outdir` passed to trace_export(). Please pass it" + " to trace_on() instead."
+ ) + _profiler.stop() trace_off() @@ -1439,7 +1452,8 @@ def trace_off(): if profiler: try: _profiler.stop() - except _profiler.ProfilerNotRunningError: + except Exception as e: # pylint: disable=broad-except + logging.warn("Error while stopping profiler: %s", e) pass diff --git a/tensorflow/python/ops/tensor_getitem_override.py b/tensorflow/python/ops/tensor_getitem_override.py new file mode 100644 index 00000000000000..67d71ae4a3c8f4 --- /dev/null +++ b/tensorflow/python/ops/tensor_getitem_override.py @@ -0,0 +1,314 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Tests for this file live in python/kernel_tests/array_ops_test.py +"""Tensor __getitem__ override logic.""" + +import numbers +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor as tensor_lib +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.util import dispatch +from tensorflow.python.util.tf_export import tf_export + + +# We override the 'slice' for the "slice" op, so we keep Python's +# existing 'slice' for later use in this module. +_BaseSlice = slice + + +_SLICE_TYPE_ERROR = ( + "Only integers, slices (`:`), ellipsis (`...`), " + "tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid " + "indices") + + +_SUPPORTED_SLICE_DTYPES = (dtypes.int16, dtypes.int32, dtypes.int32_ref, + dtypes.int64, dtypes.int64_ref) + + +def _is_undefined_dimension(d): + return isinstance(d, tensor_shape.Dimension) and d.value is None + + +def _check_index(idx): + """Check if a given value is a valid index into a tensor.""" + if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)): + return + + # Optimistic check. Assumptions: + # * any object with a dtype is supported + # * any object with a dtype has a sizeable shape attribute. + dtype = getattr(idx, "dtype", None) + if (dtype is None or dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or + idx.shape and len(idx.shape) == 1): + # TODO(slebedev): IndexError seems more appropriate here, but it + # will break `_slice_helper` contract. + raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx)) + + +@tf_export("__operators__.getitem", v1=[]) +@dispatch.add_dispatch_support +def _slice_helper(tensor, slice_spec, var=None): + """Overload for Tensor.__getitem__. + + This operation extracts the specified region from the tensor. + The notation is similar to NumPy with the restriction that + currently only support basic indexing. That means that + using a non-scalar tensor as input is not currently allowed. 
+ + Some useful examples: + + ```python + # Strip leading and trailing 2 elements + foo = tf.constant([1,2,3,4,5,6]) + print(foo[2:-2]) # => [3,4] + + # Skip every other row and reverse the order of the columns + foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) + print(foo[::2,::-1]) # => [[3,2,1], [9,8,7]] + + # Use scalar tensors as indices on both dimensions + print(foo[tf.constant(0), tf.constant(2)]) # => 3 + + # Insert another dimension + foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) + print(foo[tf.newaxis, :, :]) # => [[[1,2,3], [4,5,6], [7,8,9]]] + print(foo[:, tf.newaxis, :]) # => [[[1,2,3]], [[4,5,6]], [[7,8,9]]] + print(foo[:, :, tf.newaxis]) # => [[[1],[2],[3]], [[4],[5],[6]], + [[7],[8],[9]]] + + # Ellipses (3 equivalent operations) + foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) + print(foo[tf.newaxis, :, :]) # => [[[1,2,3], [4,5,6], [7,8,9]]] + print(foo[tf.newaxis, ...]) # => [[[1,2,3], [4,5,6], [7,8,9]]] + print(foo[tf.newaxis]) # => [[[1,2,3], [4,5,6], [7,8,9]]] + + # Masks + foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]]) + print(foo[foo > 2]) # => [3, 4, 5, 6, 7, 8, 9] + ``` + + Notes: + - `tf.newaxis` is `None` as in NumPy. + - An implicit ellipsis is placed at the end of the `slice_spec` + - NumPy advanced indexing is currently not supported. + + Purpose in the API: + + This method is exposed in TensorFlow's API so that library developers + can register dispatching for `Tensor.__getitem__` to allow it to handle + custom composite tensors & other custom objects. + + The API symbol is not intended to be called by users directly and does + appear in TensorFlow's generated documentation. + + Args: + tensor: An tensor.Tensor object. + slice_spec: The arguments to Tensor.__getitem__. + var: In the case of variable slice assignment, the Variable object to slice + (i.e. tensor is the read-only view of this variable). + + Returns: + The appropriate slice of "tensor", based on "slice_spec". + + Raises: + ValueError: If a slice range is negative size. + TypeError: If the slice indices aren't int, slice, ellipsis, + tf.newaxis or scalar int32/int64 tensors. + """ + from tensorflow.python.framework import constant_op # pylint: disable=g-import-not-at-top + from tensorflow.python.ops import array_ops # pylint: disable=g-import-not-at-top + tensor = ops.convert_to_tensor(tensor) + # TODO(wangpeng): Consider supporting var + if var is None and ops._numpy_style_slicing: # pylint: disable=protected-access + return tensor._numpy_style_getitem(slice_spec) # pylint: disable=protected-access + + if (isinstance(slice_spec, bool) + or (isinstance(slice_spec, tensor_lib.Tensor) + and slice_spec.dtype == dtypes.bool) + or (isinstance(slice_spec, np.ndarray) + and slice_spec.dtype == bool)): + return array_ops.boolean_mask(tensor=tensor, mask=slice_spec) + + if not isinstance(slice_spec, (list, tuple)): + slice_spec = [slice_spec] + + begin, end, strides = [], [], [] + index = 0 + + new_axis_mask, shrink_axis_mask = 0, 0 + begin_mask, end_mask = 0, 0 + ellipsis_mask = 0 + for s in slice_spec: + if isinstance(s, _BaseSlice): + # Finds the best dtype for begin, end, and strides. 
+ dtype = None + for t in [s.start, s.stop, s.step]: + if t is None or not isinstance(t, tensor_lib.Tensor): + continue + if t.dtype == dtypes.int64: + dtype = dtypes.int64 + elif t.dtype == dtypes.int32 and dtype != dtypes.int64: + dtype = dtypes.int32 + elif t.dtype == dtypes.int16 and dtype is None: + dtype = dtypes.int16 + + if s.start is not None and not _is_undefined_dimension(s.start): + _check_index(s.start) + begin.append(s.start) + else: + if dtype is not None: + begin.append(constant_op.constant(0, dtype=dtype)) + else: + begin.append(0) + begin_mask |= (1 << index) + if s.stop is not None and not _is_undefined_dimension(s.stop): + _check_index(s.stop) + end.append(s.stop) + else: + if dtype is not None: + end.append(constant_op.constant(0, dtype=dtype)) + else: + end.append(0) + end_mask |= (1 << index) + if s.step is not None and not _is_undefined_dimension(s.step): + _check_index(s.step) + strides.append(s.step) + else: + if dtype is not None: + strides.append(constant_op.constant(1, dtype=dtype)) + else: + strides.append(1) + elif s is Ellipsis: + begin.append(0) + end.append(0) + strides.append(1) + ellipsis_mask |= (1 << index) + elif s is array_ops.newaxis: + begin.append(0) + end.append(0) + strides.append(1) + new_axis_mask |= (1 << index) + else: + _check_index(s) + begin.append(s) + end.append(s + 1) + # TODO(mdan): Investigate why we can't set int32 here. + if ( + isinstance(s, tensor_lib.Tensor) + and (s.dtype == dtypes.int16 or s.dtype == dtypes.int64)): + strides.append(constant_op.constant(1, dtype=s.dtype)) + else: + strides.append(1) + shrink_axis_mask |= (1 << index) + index += 1 + + # stack possibly involves no tensors, so we must use op_scope correct graph. + with ops.name_scope( + None, + "strided_slice", [tensor] + begin + end + strides, + skip_on_eager=False) as name: + if begin: + from tensorflow.python.ops import array_ops_stack # pylint: disable=g-import-not-at-top + packed_begin, packed_end, packed_strides = ( + array_ops_stack.stack(begin), + array_ops_stack.stack(end), + array_ops_stack.stack(strides)) + # TODO(mdan): Instead of implicitly casting, it's better to enforce the + # same dtypes. + if (packed_begin.dtype == dtypes.int64 or + packed_end.dtype == dtypes.int64 or + packed_strides.dtype == dtypes.int64): + if packed_begin.dtype != dtypes.int64: + packed_begin = gen_math_ops.cast(packed_begin, dtypes.int64) + if packed_end.dtype != dtypes.int64: + packed_end = gen_math_ops.cast(packed_end, dtypes.int64) + if packed_strides.dtype != dtypes.int64: + packed_strides = gen_math_ops.cast(packed_strides, dtypes.int64) + elif (packed_begin.dtype == dtypes.int16 and + packed_end.dtype == dtypes.int16 and + packed_strides.dtype == dtypes.int16): + if packed_begin.dtype != dtypes.int16: + packed_begin = gen_math_ops.cast(packed_begin, dtypes.int16) + if packed_end.dtype != dtypes.int16: + packed_end = gen_math_ops.cast(packed_end, dtypes.int16) + if packed_strides.dtype != dtypes.int16: + packed_strides = gen_math_ops.cast(packed_strides, dtypes.int16) + else: + var_empty = constant_op.constant([], dtype=dtypes.int32) + packed_begin = packed_end = packed_strides = var_empty + return array_ops.strided_slice( + tensor, + packed_begin, + packed_end, + packed_strides, + begin_mask=begin_mask, + end_mask=end_mask, + shrink_axis_mask=shrink_axis_mask, + new_axis_mask=new_axis_mask, + ellipsis_mask=ellipsis_mask, + var=var, + name=name) + + +def _slice_helper_var(var, slice_spec): + """Creates a slice helper object given a variable. 
+ + This allows creating a sub-tensor from part of the current contents + of a variable. See `tf.Tensor.__getitem__` for detailed examples + of slicing. + + This function in addition also allows assignment to a sliced range. + This is similar to `__setitem__` functionality in Python. However, + the syntax is different so that the user can capture the assignment + operation for grouping or passing to `sess.run()` in TF1. + For example, + + ```python + import tensorflow as tf + A = tf.Variable([[1,2,3], [4,5,6], [7,8,9]], dtype=tf.float32) + print(A[:2, :2]) # => [[1,2], [4,5]] + + A[:2,:2].assign(22. * tf.ones((2, 2)))) + print(A) # => [[22, 22, 3], [22, 22, 6], [7,8,9]] + ``` + + Note that assignments currently do not support NumPy broadcasting + semantics. + + Args: + var: An `ops.Variable` object. + slice_spec: The arguments to `Tensor.__getitem__`. + + Returns: + The appropriate slice of "tensor", based on "slice_spec". + As an operator. The operator also has a `assign()` method + that can be used to generate an assignment operator. + + Raises: + ValueError: If a slice range is negative size. + TypeError: TypeError: If the slice indices aren't int, slice, + ellipsis, tf.newaxis or int32/int64 tensors. + + """ + + return _slice_helper(var.value(), slice_spec, var) + + +tensor_lib.Tensor._override_operator("__getitem__", _slice_helper) # pylint: disable=protected-access diff --git a/tensorflow/python/ops/tensor_math_operator_overrides.py b/tensorflow/python/ops/tensor_math_operator_overrides.py new file mode 100644 index 00000000000000..f94d2a14da8faa --- /dev/null +++ b/tensorflow/python/ops/tensor_math_operator_overrides.py @@ -0,0 +1,168 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Overrides for Tensor operators.""" + + +from tensorflow.python.framework import override_binary_operator +from tensorflow.python.framework import tensor as tensor_lib +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.util import tf_decorator + + +# pylint: disable=g-import-not-at-top +def _add_dispatch_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops._add_dispatch(x, y, name=name) # pylint: disable=protected-access + + +def _and_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.and_(x, y, name=name) + + +def _div_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.div(x, y, name=name) + + +def _floordiv_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.floordiv(x, y, name=name) + + +def _matmul_factory(a, b, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.matmul_wrapper(a, b, name=name) + + +def _mod_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.mod(x, y, name=name) + + +def _mul_dispatch_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops._mul_dispatch(x, y, name=name) # pylint: disable=protected-access + + +def _or_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.or_(x, y, name=name) + + +def _pow_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.pow(x, y, name=name) + + +def _subtract_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.subtract(x, y, name=name) + + +def _truediv_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.truediv(x, y, name=name) + + +def _xor_factory(x, y, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.xor_(x, y, name=name) + + +override_binary_operator.override_binary_operator_helper( + _add_dispatch_factory, "add" +) +override_binary_operator.override_binary_operator_helper(_and_factory, "and") +override_binary_operator.override_binary_operator_helper(_div_factory, "div") +override_binary_operator.override_binary_operator_helper( + _floordiv_factory, "floordiv" +) +override_binary_operator.override_binary_operator_helper( + _matmul_factory, "matmul" +) +override_binary_operator.override_binary_operator_helper(_mod_factory, "mod") +override_binary_operator.override_binary_operator_helper( + _mul_dispatch_factory, "mul" +) +override_binary_operator.override_binary_operator_helper(_or_factory, "or") +override_binary_operator.override_binary_operator_helper(_pow_factory, "pow") +override_binary_operator.override_binary_operator_helper( + _subtract_factory, "sub" +) +override_binary_operator.override_binary_operator_helper( + _truediv_factory, "truediv" +) +override_binary_operator.override_binary_operator_helper(_xor_factory, "xor") + + +def _invert_factory(x, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.invert_(x, name=name) + + +def _abs_factory(x, name=None): + from tensorflow.python.ops import math_ops + + return math_ops.abs(x, name=name) + + +def _tensor_equals_factory(self, other): + from tensorflow.python.ops import math_ops + + return math_ops.tensor_equals(self, other) + + +def _tensor_not_equals_factory(self, other): + from tensorflow.python.ops import 
math_ops + + return math_ops.tensor_not_equals(self, other) + + +def _promote_dtypes_decorator(fn): + def wrapper(x, y, *args, **kwargs): + x, y = override_binary_operator.maybe_promote_tensors(x, y) + return fn(x, y, *args, **kwargs) + + return tf_decorator.make_decorator(fn, wrapper) + + +# pylint: disable=protected-access +tensor_lib.Tensor._override_operator("__invert__", _invert_factory) +tensor_lib.Tensor._override_operator("__neg__", gen_math_ops.neg) +tensor_lib.Tensor._override_operator("__abs__", _abs_factory) +tensor_lib.Tensor._override_operator("__lt__", _promote_dtypes_decorator( + gen_math_ops.less)) +tensor_lib.Tensor._override_operator("__le__", _promote_dtypes_decorator( + gen_math_ops.less_equal)) +tensor_lib.Tensor._override_operator("__gt__", _promote_dtypes_decorator( + gen_math_ops.greater)) +tensor_lib.Tensor._override_operator("__ge__", _promote_dtypes_decorator( + gen_math_ops.greater_equal)) +tensor_lib.Tensor._override_operator("__eq__", _tensor_equals_factory) +tensor_lib.Tensor._override_operator("__ne__", _tensor_not_equals_factory) diff --git a/tensorflow/python/ops/variable_v1.py b/tensorflow/python/ops/variable_v1.py index d7d4f0e5daeee9..f3cca80758e5cb 100644 --- a/tensorflow/python/ops/variable_v1.py +++ b/tensorflow/python/ops/variable_v1.py @@ -23,15 +23,6 @@ from tensorflow.python.util.tf_export import tf_export -_variable_from_proto_fn = None - - -def set_variable_from_proto_fn(variable_from_proto_fn): - """Set the variable class that variable proto defs will be converted to.""" - global _variable_from_proto_fn - _variable_from_proto_fn = variable_from_proto_fn - - @tf_export(v1=["is_variable_initialized"]) @tf_should_use.should_use_result def is_variable_initialized(variable): @@ -47,9 +38,12 @@ def is_variable_initialized(variable): return state_ops.is_variable_initialized(variable) -def default_variable_creator(_, **kwds): - del kwds - raise NotImplementedError("ref_variable needs to be imported") +def default_variable_creator(next_creator=None, **kwds): + from tensorflow.python.ops import ref_variable # pylint: disable=g-import-not-at-top + + return ref_variable.default_variable_creator( + next_creator=next_creator, **kwds + ) @tf_export(v1=["Variable"]) @@ -269,7 +263,8 @@ def initialized_value(self): @staticmethod def from_proto(variable_def, import_scope=None): - return _variable_from_proto_fn( + from tensorflow.python.ops import ref_variable # pylint: disable=g-import-not-at-top + return ref_variable.RefVariable( variable_def=variable_def, import_scope=import_scope) @classmethod diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 5208dd1c8229ae..49821d75da445d 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -34,8 +34,8 @@ from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops +from tensorflow.python.ops import tensor_getitem_override from tensorflow.python.trackable import base as trackable -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import object_identity from tensorflow.python.util import tf_should_use from tensorflow.python.util import traceback_utils @@ -44,9 +44,11 @@ from tensorflow.python.util.tf_export import tf_export -def default_variable_creator_v2(_, **kwds): - del kwds - raise NotImplementedError("resource_variable_ops needs to be imported") +def default_variable_creator_v2(next_creator=None, **kwds): + from tensorflow.python.ops import 
resource_variable_ops # pylint: disable=g-import-not-at-top + + return resource_variable_ops.default_variable_creator_v2( + next_creator=next_creator, **kwds) def _make_getter(captured_getter, captured_previous): @@ -984,10 +986,10 @@ def _OverloadAllOperators(cls): # pylint: disable=invalid-name """Register overloads for all operators.""" for operator in tensor_lib.Tensor.OVERLOADABLE_OPERATORS: cls._OverloadOperator(operator) - # For slicing, bind getitem differently than a tensor (use SliceHelperVar + # For slicing, bind getitem differently than a tensor (use _slice_helper_var # instead) # pylint: disable=protected-access - setattr(cls, "__getitem__", array_ops._SliceHelperVar) + setattr(cls, "__getitem__", tensor_getitem_override._slice_helper_var) @classmethod def _OverloadOperator(cls, operator): # pylint: disable=invalid-name @@ -1324,7 +1326,6 @@ def to_proto(self, export_scope=None): Variable._OverloadAllOperators() # pylint: disable=protected-access -_pywrap_utils.RegisterType("Variable", Variable) def _try_guard_against_uninitialized_dependencies(name, initial_value): diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py index d95dcb79d1e4fd..f5df743995fb86 100644 --- a/tensorflow/python/profiler/internal/run_metadata_test.py +++ b/tensorflow/python/profiler/internal/run_metadata_test.py @@ -112,9 +112,6 @@ class RunMetadataTest(test.TestCase): # work as expected. Since we now run this test with SOFTWARE_TRACE # (see _run_model routine above), this test will / should fail since # GPU device tracers are not enabled - @test.disable_with_predicate( - pred=test.is_built_with_rocm, - skip_message='Test fails on ROCm when run without FULL_TRACE') @test_util.run_deprecated_v1 def testGPU(self): if not test.is_gpu_available(cuda_only=True): diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 2b522cddc7ce22..3ae6d97eb18ef8 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -646,7 +646,7 @@ cuda_py_strict_test( "//tensorflow/python/trackable:resource", "//tensorflow/python/training:monitored_session", "//tensorflow/python/types:core", - "//tensorflow/python/util:tf_decorator", + "//tensorflow/python/util:tf_inspect", "@absl_py//absl/testing:parameterized", ] + if_google([ "//tensorflow/cc/experimental/tf2:runtime_pybind", @@ -770,7 +770,8 @@ py_strict_library( "//tensorflow/python/ops:resource_variable_ops", "//tensorflow/python/util:compat", "//tensorflow/python/util:nest", - "//tensorflow/python/util:tf_decorator", + "//tensorflow/python/util:tf_decorator_py", + "//tensorflow/python/util:tf_inspect", "@absl_py//absl/logging", ], ) @@ -914,7 +915,6 @@ tf_python_pybind_extension( # "//tensorflow:windows": [], # }), # static_deps = tf_python_pybind_static_deps(), - features = ["-layering_check"], pytype_srcs = [ "pywrap_saved_model/__init__.pyi", "pywrap_saved_model/constants.pyi", @@ -928,17 +928,27 @@ tf_python_pybind_extension( "//tensorflow/python/training:__subpackages__", ], deps = [ - ":pywrap_saved_model_headers", # placeholder for index annotation deps "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", "//tensorflow/cc/experimental/libexport:save", + "//tensorflow/cc/saved_model:constants", + "//tensorflow/cc/saved_model:fingerprinting", + "//tensorflow/cc/saved_model:metrics", "//tensorflow/cc/saved_model:reader", + 
"//tensorflow/core:core_cpu_base", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:path", "//tensorflow/python/lib/core:pybind11_status", "@pybind11", + "@pybind11_abseil//pybind11_abseil:absl_casters", "@pybind11_abseil//pybind11_abseil:status_casters", "@pybind11_protobuf//pybind11_protobuf:native_proto_caster", - ], + ] + if_google([ + "//tensorflow/tools/proto_splitter:merge", + ]), ) tf_py_strict_test( diff --git a/tensorflow/python/saved_model/fingerprinting.py b/tensorflow/python/saved_model/fingerprinting.py index dd8be59cfaa694..30d9dc76987b88 100644 --- a/tensorflow/python/saved_model/fingerprinting.py +++ b/tensorflow/python/saved_model/fingerprinting.py @@ -18,13 +18,15 @@ fingerprint. """ +from typing import Any + from tensorflow.core.protobuf import fingerprint_pb2 from tensorflow.python.saved_model.pywrap_saved_model import fingerprinting as fingerprinting_pywrap from tensorflow.python.util.tf_export import tf_export @tf_export("saved_model.experimental.Fingerprint", v1=[]) -class Fingerprint(object): +class Fingerprint: """The SavedModel fingerprint. Each attribute of this class is named after a field name in the @@ -42,12 +44,12 @@ class Fingerprint(object): def __init__( self, - saved_model_checksum=None, - graph_def_program_hash=None, - signature_def_hash=None, - saved_object_graph_hash=None, - checkpoint_hash=None, - version=None, + saved_model_checksum: int = None, + graph_def_program_hash: int = None, + signature_def_hash: int = None, + saved_object_graph_hash: int = None, + checkpoint_hash: int = None, + version: int = None, ): """Initializes the instance based on values in the SavedModel fingerprint. @@ -67,7 +69,7 @@ def __init__( self.version = version @classmethod - def from_proto(cls, proto): + def from_proto(cls, proto: fingerprint_pb2.FingerprintDef) -> "Fingerprint": """Constructs Fingerprint object from protocol buffer message.""" if isinstance(proto, bytes): proto = fingerprint_pb2.FingerprintDef.FromString(proto) @@ -84,7 +86,7 @@ def from_proto(cls, proto): f"Given proto could not be deserialized as fingerprint." f"{e}") from None - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if (isinstance(other, Fingerprint) or isinstance(other, fingerprint_pb2.FingerprintDef)): try: @@ -98,9 +100,9 @@ def __eq__(self, other): pass return False - def __str__(self): + def __str__(self) -> str: return "\n".join([ - f"SavedModel Fingerprint", + "SavedModel Fingerprint", f" saved_model_checksum: {self.saved_model_checksum}", f" graph_def_program_hash: {self.graph_def_program_hash}", f" signature_def_hash: {self.signature_def_hash}", @@ -108,14 +110,14 @@ def __str__(self): f" checkpoint_hash: {self.checkpoint_hash}" ]) - def __repr__(self): + def __repr__(self) -> str: return (f"Fingerprint({self.saved_model_checksum}, " f"{self.graph_def_program_hash}, " f"{self.signature_def_hash}, " f"{self.saved_object_graph_hash}, " f"{self.checkpoint_hash})") - def singleprint(self): + def singleprint(self) -> fingerprinting_pywrap.Singleprint: """Canonical fingerprinting ID for a SavedModel. Uniquely identifies a SavedModel based on the regularized fingerprint @@ -147,7 +149,7 @@ def singleprint(self): @tf_export("saved_model.experimental.read_fingerprint", v1=[]) -def read_fingerprint(export_dir): +def read_fingerprint(export_dir: str) -> Fingerprint: """Reads the fingerprint of a SavedModel in `export_dir`. 
Returns a `tf.saved_model.experimental.Fingerprint` object that contains diff --git a/tensorflow/python/saved_model/fingerprinting_utils.py b/tensorflow/python/saved_model/fingerprinting_utils.py index 67ebdd33cd7704..cb31860ed81bd3 100644 --- a/tensorflow/python/saved_model/fingerprinting_utils.py +++ b/tensorflow/python/saved_model/fingerprinting_utils.py @@ -32,7 +32,7 @@ FingerprintException = fingerprinting_pywrap.FingerprintException -def write_fingerprint(export_dir): +def write_fingerprint(export_dir: str) -> None: """Write fingerprint protobuf, if requested. Writes a `tf.saved_model.experimental.Fingerprint` object to a @@ -66,7 +66,7 @@ def write_fingerprint(export_dir): "Model saving will continue.") -def singleprint_from_saved_model_proto(export_dir): +def singleprint_from_saved_model_proto(export_dir: str) -> str: """Returns the singleprint of `saved_model.pb` in `export_dir`. Args: @@ -85,7 +85,7 @@ def singleprint_from_saved_model_proto(export_dir): raise ValueError(e) from None -def singleprint_from_fingerprint_proto(export_dir): +def singleprint_from_fingerprint_proto(export_dir: str) -> str: """Returns the singleprint of `fingerprint.pb` in `export_dir`. Args: @@ -104,7 +104,7 @@ def singleprint_from_fingerprint_proto(export_dir): raise ValueError(e) from None -def singleprint_from_saved_model(export_dir): +def singleprint_from_saved_model(export_dir: str) -> str: """Returns the singleprint of the SavedModel in `export_dir`. First tries to construct the singleprint from `fingerprint.pb`, then from @@ -141,9 +141,8 @@ def singleprint_from_saved_model(export_dir): raise ValueError(e) from None -def to_proto(fingerprint): - if not isinstance(fingerprint, fingerprinting.Fingerprint): - raise TypeError("Supplied value is not a Fingerprint.") +def to_proto( + fingerprint: fingerprinting.Fingerprint) -> fingerprint_pb2.FingerprintDef: return fingerprint_pb2.FingerprintDef( saved_model_checksum=fingerprint.saved_model_checksum, graph_def_program_hash=fingerprint.graph_def_program_hash, diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py index 8f498936328e86..a08d41f5fcf499 100644 --- a/tensorflow/python/saved_model/load_test.py +++ b/tensorflow/python/saved_model/load_test.py @@ -2970,7 +2970,7 @@ def increment_v(x): # TODO(allenl, kkb): Use the new memory checker here once it's fast enough (3 # iterations took hundreds of seconds). It would be really nice to check # allocations at a lower level. 
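The fingerprinting helpers above (`read_fingerprint`, `Fingerprint.singleprint`) now carry explicit type hints. A minimal usage sketch, assuming a SavedModel with a `fingerprint.pb` already exists at the hypothetical path below:

```python
import tensorflow as tf

# Hypothetical export directory; assumes the SavedModel was written with
# fingerprinting enabled so that fingerprint.pb is present.
fp = tf.saved_model.experimental.read_fingerprint("/tmp/my_saved_model")
print(fp)                # readable dump of the fingerprint fields
print(fp.singleprint())  # canonical ID derived from the regularized fingerprint
```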
- @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def test_functions_cleaned(self, use_cpp_bindings): # TODO(b/264869753) Fix SingleCycleTest if use_cpp_bindings: diff --git a/tensorflow/python/saved_model/registration/registration_saving_test.py b/tensorflow/python/saved_model/registration/registration_saving_test.py index ec87f06c0a5e10..8e60cc2bf9e122 100644 --- a/tensorflow/python/saved_model/registration/registration_saving_test.py +++ b/tensorflow/python/saved_model/registration/registration_saving_test.py @@ -223,7 +223,7 @@ def test_registered_saver(self, cycles): class SingleCycleTest(test.TestCase): - @test_util.deprecated_graph_mode_only() + @test_util.deprecated_graph_mode_only def test_registered_saver_fails_in_saved_model_graph_mode(self): with context.eager_mode(): p1 = Part([1, 4]) diff --git a/tensorflow/python/summary/BUILD b/tensorflow/python/summary/BUILD index 5ed5b0f74dc2df..7af5c5cb277ae8 100644 --- a/tensorflow/python/summary/BUILD +++ b/tensorflow/python/summary/BUILD @@ -37,6 +37,7 @@ py_strict_library( srcs = ["summary.py"], visibility = ["//visibility:public"], deps = [ + ":tb_summary", "//tensorflow/core:protos_all_py", "//tensorflow/python/distribute:summary_op_util", "//tensorflow/python/eager:context", @@ -124,3 +125,10 @@ tf_py_strict_test( "@pypi_tb_nightly//:pkg", ], ) + +py_strict_library( + name = "tb_summary", + srcs = ["tb_summary.py"], + visibility = ["//tensorflow:internal"], + deps = ["//tensorflow/python/util:tf_export"], +) diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py index 161456a7aecae0..b6112b1d7db1d9 100644 --- a/tensorflow/python/summary/summary.py +++ b/tensorflow/python/summary/summary.py @@ -46,7 +46,7 @@ from tensorflow.python.ops import gen_summary_ops as _gen_summary_ops # pylint: disable=unused-import from tensorflow.python.ops import summary_op_util as _summary_op_util from tensorflow.python.ops import summary_ops_v2 as _summary_ops_v2 - +from tensorflow.python.summary import tb_summary # exports FileWriter, FileWriterCache # pylint: disable=unused-import from tensorflow.python.summary.writer.writer import FileWriter @@ -124,9 +124,8 @@ def scalar(name, tensor, collections=None, family=None): if _should_invoke_v2_op(): # Defer the import to happen inside the symbol to prevent breakage due to # missing dependency. - from tensorboard.summary.v2 import scalar as scalar_v2 # pylint: disable=g-import-not-at-top with _compat_summary_scope(name, family) as tag: - scalar_v2(name=tag, data=tensor, step=_get_step_for_v2()) + tb_summary.scalar(name=tag, data=tensor, step=_get_step_for_v2()) # Return an empty Tensor, which will be acceptable as an input to the # `tf.compat.v1.summary.merge()` API. return _constant_op.constant(b'') @@ -235,9 +234,8 @@ def image(name, tensor, max_outputs=3, collections=None, family=None): if _should_invoke_v2_op(): # Defer the import to happen inside the symbol to prevent breakage due to # missing dependency. - from tensorboard.summary.v2 import image as image_v2 # pylint: disable=g-import-not-at-top with _compat_summary_scope(name, family) as tag: - image_v2( + tb_summary.image( name=tag, data=tensor, step=_get_step_for_v2(), @@ -330,9 +328,8 @@ def histogram(name, values, collections=None, family=None): if _should_invoke_v2_op(): # Defer the import to happen inside the symbol to prevent breakage due to # missing dependency. 
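With the summary.py hunks above, the TF1 summary ops no longer import `tensorboard.summary.v2` inline; they forward through the new `tb_summary` shim when the v2 code path is taken. A rough usage sketch, assuming TF2 eager execution with a default writer and an explicit step (the conditions `_should_invoke_v2_op` checks); the log directory is hypothetical:

```python
import tensorflow as tf

writer = tf.summary.create_file_writer("/tmp/logs")
with writer.as_default(step=0):
    # Forwarded to tb_summary.scalar; the v1 op returns an empty string tensor
    # that tf.compat.v1.summary.merge() still accepts.
    tf.compat.v1.summary.scalar("loss", 0.5)
```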
- from tensorboard.summary.v2 import histogram as histogram_v2 # pylint: disable=g-import-not-at-top with _compat_summary_scope(name, family) as tag: - histogram_v2(name=tag, data=values, step=_get_step_for_v2()) + tb_summary.histogram(name=tag, data=values, step=_get_step_for_v2()) # Return an empty Tensor, which will be acceptable as an input to the # `tf.compat.v1.summary.merge()` API. return _constant_op.constant(b'') @@ -440,12 +437,11 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None, if _should_invoke_v2_op(): # Defer the import to happen inside the symbol to prevent breakage due to # missing dependency. - from tensorboard.summary.v2 import audio as audio_v2 # pylint: disable=g-import-not-at-top if tensor.shape.rank == 2: # TF2 op requires 3-D tensor, add the `channels` dimension. tensor = _array_ops.expand_dims_v2(tensor, axis=2) with _compat_summary_scope(name, family) as tag: - audio_v2( + tb_summary.audio( name=tag, data=tensor, sample_rate=sample_rate, @@ -540,8 +536,7 @@ def text(name, tensor, collections=None): return _constant_op.constant('') # Defer the import to happen inside the symbol to prevent breakage due to # missing dependency. - from tensorboard.summary.v2 import text as text_v2 # pylint: disable=g-import-not-at-top - text_v2(name=name, data=tensor, step=_get_step_for_v2()) + tb_summary.text(name=name, data=tensor, step=_get_step_for_v2()) # Return an empty Tensor, which will be acceptable as an input to the # `tf.compat.v1.summary.merge()` API. return _constant_op.constant(b'') diff --git a/tensorflow/python/summary/summary_v2_test.py b/tensorflow/python/summary/summary_v2_test.py index d6454b46893f05..6e3721b311f209 100644 --- a/tensorflow/python/summary/summary_v2_test.py +++ b/tensorflow/python/summary/summary_v2_test.py @@ -43,7 +43,9 @@ def test_scalar_summary_v2__w_writer(self): # Returns empty string. self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) - mock_scalar_v2.assert_called_once_with('float', data=i, step=1) + mock_scalar_v2.assert_called_once_with( + name='float', data=i, step=1, description=test.mock.ANY + ) @test_util.run_v2_only def test_scalar_summary_v2__wo_writer(self): @@ -79,7 +81,11 @@ def test_scalar_summary_v2__family(self): self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) mock_scalar_v2.assert_called_once_with( - 'otter/otter/float', data=constant_op.constant(2.5), step=1) + name='otter/otter/float', + data=constant_op.constant(2.5), + step=1, + description=test.mock.ANY, + ) @test_util.run_v2_only def test_scalar_summary_v2__family_w_outer_scope(self): @@ -95,7 +101,11 @@ def test_scalar_summary_v2__family_w_outer_scope(self): self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) mock_scalar_v2.assert_called_once_with( - 'crabnet/sea/crabnet/float', data=constant_op.constant(3.5), step=1) + name='crabnet/sea/crabnet/float', + data=constant_op.constant(3.5), + step=1, + description=test.mock.ANY, + ) @test_util.run_v2_only def test_scalar_summary_v2__v1_set_step(self): @@ -111,7 +121,9 @@ def test_scalar_summary_v2__v1_set_step(self): # Returns empty string. 
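The surrounding summary_v2_test.py updates pass every argument to the mocked TensorBoard op by keyword and use `test.mock.ANY` as a wildcard for values the test does not pin down (for example `description`). A self-contained sketch of the same idiom with plain `unittest.mock`:

```python
from unittest import mock

summary_fn = mock.Mock()
summary_fn(name="loss", data=0.5, step=1, description=None)

# mock.ANY matches whatever value was actually passed for `description`.
summary_fn.assert_called_once_with(
    name="loss", data=0.5, step=1, description=mock.ANY
)
```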
self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) - mock_scalar_v2.assert_called_once_with('float', data=i, step=1024) + mock_scalar_v2.assert_called_once_with( + name='float', data=i, step=1024, description=test.mock.ANY + ) @test_util.run_v2_only def test_image_summary_v2(self): @@ -127,7 +139,12 @@ def test_image_summary_v2(self): self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) mock_image_v2.assert_called_once_with( - 'family/outer/family/image', data=i, step=2, max_outputs=3) + name='family/outer/family/image', + data=i, + step=2, + max_outputs=3, + description=test.mock.ANY, + ) @test_util.run_v2_only def test_histogram_summary_v2(self): @@ -142,7 +159,12 @@ def test_histogram_summary_v2(self): self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) mock_histogram_v2.assert_called_once_with( - 'family/family/histogram', data=i, step=3) + name='family/family/histogram', + data=i, + step=3, + buckets=test.mock.ANY, + description=test.mock.ANY, + ) @test_util.run_v2_only def test_audio_summary_v2(self): @@ -158,7 +180,14 @@ def test_audio_summary_v2(self): self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) mock_audio_v2.assert_called_once_with( - 'dolphin/wave', data=i, sample_rate=0.2, step=10, max_outputs=3) + name='dolphin/wave', + data=i, + sample_rate=0.2, + step=10, + max_outputs=3, + encoding=test.mock.ANY, + description=test.mock.ANY, + ) @test_util.run_v2_only def test_audio_summary_v2__2d_tensor(self): @@ -175,7 +204,14 @@ def test_audio_summary_v2__2d_tensor(self): self.assertEqual(tensor.dtype, dtypes.string) mock_audio_v2.assert_called_once_with( - 'wave', data=test.mock.ANY, sample_rate=0.2, step=11, max_outputs=3) + name='wave', + data=test.mock.ANY, + sample_rate=0.2, + step=11, + max_outputs=3, + encoding=test.mock.ANY, + description=test.mock.ANY, + ) input_3d = array_ops.ones((5, 3, 1)) # 3-D input tensor self.assertAllEqual(mock_audio_v2.call_args[1]['data'], input_3d) @@ -191,7 +227,9 @@ def test_text_summary_v2(self): # Returns empty string. self.assertEqual(tensor.numpy(), b'') self.assertEqual(tensor.dtype, dtypes.string) - mock_text_v2.assert_called_once_with('text', data=i, step=22) + mock_text_v2.assert_called_once_with( + name='text', data=i, step=22, description=test.mock.ANY + ) if __name__ == '__main__': diff --git a/tensorflow/python/summary/tb_summary.py b/tensorflow/python/summary/tb_summary.py new file mode 100644 index 00000000000000..682ca5a2b7e1dd --- /dev/null +++ b/tensorflow/python/summary/tb_summary.py @@ -0,0 +1,374 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Re-exports the APIs of TF2 summary that live in TensorBoard.""" + +from tensorflow.python.util.tf_export import tf_export + +_TENSORBOARD_NOT_INSTALLED_ERROR = ( + "TensorBoard is not installed, missing implementation for" +) + + +class TBNotInstalledError(Exception): + + def __init__(self, summary_api): + self.error_message = f"{_TENSORBOARD_NOT_INSTALLED_ERROR} {summary_api}" + super().__init__(self.error_message) + + +@tf_export("summary.audio", v1=[]) +def audio( + name, + data, + sample_rate, + step=None, + max_outputs=3, + encoding=None, + description=None, +): + """Write an audio summary. + + Arguments: + name: A name for this summary. The summary tag used for TensorBoard will be + this name prefixed by any active name scopes. + data: A `Tensor` representing audio data with shape `[k, t, c]`, where `k` + is the number of audio clips, `t` is the number of frames, and `c` is the + number of channels. Elements should be floating-point values in `[-1.0, + 1.0]`. Any of the dimensions may be statically unknown (i.e., `None`). + sample_rate: An `int` or rank-0 `int32` `Tensor` that represents the sample + rate, in Hz. Must be positive. + step: Explicit `int64`-castable monotonic step value for this summary. If + omitted, this defaults to `tf.summary.experimental.get_step()`, which must + not be None. + max_outputs: Optional `int` or rank-0 integer `Tensor`. At most this many + audio clips will be emitted at each step. When more than `max_outputs` + many clips are provided, the first `max_outputs` many clips will be used + and the rest silently discarded. + encoding: Optional constant `str` for the desired encoding. Only "wav" is + currently supported, but this is not guaranteed to remain the default, so + if you want "wav" in particular, set this explicitly. + description: Optional long-form description for this summary, as a constant + `str`. Markdown is supported. Defaults to empty. + + Returns: + True on success, or false if no summary was emitted because no default + summary writer was available. + + Raises: + ValueError: if a default writer exists, but no step was provided and + `tf.summary.experimental.get_step()` is None. + """ + try: + from tensorboard.summary.v2 import audio as audio_v2 # pylint: disable=g-import-not-at-top, g-importing-member + except ImportError as exc: + raise TBNotInstalledError("tf.summary.audio") from exc + return audio_v2( + name=name, + data=data, + sample_rate=sample_rate, + step=step, + max_outputs=max_outputs, + encoding=encoding, + description=description, + ) + + +@tf_export("summary.histogram", v1=[]) +def histogram(name, data, step=None, buckets=None, description=None): + """Write a histogram summary. + + See also `tf.summary.scalar`, `tf.summary.SummaryWriter`. + + Writes a histogram to the current default summary writer, for later analysis + in TensorBoard's 'Histograms' and 'Distributions' dashboards (data written + using this API will appear in both places). Like `tf.summary.scalar` points, + each histogram is associated with a `step` and a `name`. All the histograms + with the same `name` constitute a time series of histograms. + + The histogram is calculated over all the elements of the given `Tensor` + without regard to its shape or rank. 
+ + This example writes 2 histograms: + + ```python + w = tf.summary.create_file_writer('test/logs') + with w.as_default(): + tf.summary.histogram("activations", tf.random.uniform([100, 50]), step=0) + tf.summary.histogram("initial_weights", tf.random.normal([1000]), step=0) + ``` + + A common use case is to examine the changing activation patterns (or lack + thereof) at specific layers in a neural network, over time. + + ```python + w = tf.summary.create_file_writer('test/logs') + with w.as_default(): + for step in range(100): + # Generate fake "activations". + activations = [ + tf.random.normal([1000], mean=step, stddev=1), + tf.random.normal([1000], mean=step, stddev=10), + tf.random.normal([1000], mean=step, stddev=100), + ] + + tf.summary.histogram("layer1/activate", activations[0], step=step) + tf.summary.histogram("layer2/activate", activations[1], step=step) + tf.summary.histogram("layer3/activate", activations[2], step=step) + ``` + + Arguments: + name: A name for this summary. The summary tag used for TensorBoard will be + this name prefixed by any active name scopes. + data: A `Tensor` of any shape. The histogram is computed over its elements, + which must be castable to `float64`. + step: Explicit `int64`-castable monotonic step value for this summary. If + omitted, this defaults to `tf.summary.experimental.get_step()`, which must + not be None. + buckets: Optional positive `int`. The output will have this many buckets, + except in two edge cases. If there is no data, then there are no buckets. + If there is data but all points have the same value, then all buckets' + left and right endpoints are the same and only the last bucket has nonzero + count. Defaults to 30 if not specified. + description: Optional long-form description for this summary, as a constant + `str`. Markdown is supported. Defaults to empty. + + Returns: + True on success, or false if no summary was emitted because no default + summary writer was available. + + Raises: + ValueError: if a default writer exists, but no step was provided and + `tf.summary.experimental.get_step()` is None. + """ + try: + from tensorboard.summary.v2 import histogram as histogram_v2 # pylint: disable=g-import-not-at-top, g-importing-member + except ImportError as exc: + raise TBNotInstalledError("tf.summary.histogram") from exc + return histogram_v2( + name=name, data=data, step=step, buckets=buckets, description=description + ) + + +@tf_export("summary.image", v1=[]) +def image(name, data, step=None, max_outputs=3, description=None): + """Write an image summary. + + See also `tf.summary.scalar`, `tf.summary.SummaryWriter`. + + Writes a collection of images to the current default summary writer. Data + appears in TensorBoard's 'Images' dashboard. Like `tf.summary.scalar` points, + each collection of images is associated with a `step` and a `name`. All the + image collections with the same `name` constitute a time series of image + collections. + + This example writes 2 random grayscale images: + + ```python + w = tf.summary.create_file_writer('test/logs') + with w.as_default(): + image1 = tf.random.uniform(shape=[8, 8, 1]) + image2 = tf.random.uniform(shape=[8, 8, 1]) + tf.summary.image("grayscale_noise", [image1, image2], step=0) + ``` + + To avoid clipping, data should be converted to one of the following: + + - floating point values in the range [0,1], or + - uint8 values in the range [0,255] + + ```python + # Convert the original dtype=int32 `Tensor` into `dtype=float64`. 
+ rgb_image_float = tf.constant([ + [[1000, 0, 0], [0, 500, 1000]], + ]) / 1000 + tf.summary.image("picture", [rgb_image_float], step=0) + + # Convert original dtype=uint8 `Tensor` into proper range. + rgb_image_uint8 = tf.constant([ + [[1, 1, 0], [0, 0, 1]], + ], dtype=tf.uint8) * 255 + tf.summary.image("picture", [rgb_image_uint8], step=1) + ``` + + Arguments: + name: A name for this summary. The summary tag used for TensorBoard will be + this name prefixed by any active name scopes. + data: A `Tensor` representing pixel data with shape `[k, h, w, c]`, where + `k` is the number of images, `h` and `w` are the height and width of the + images, and `c` is the number of channels, which should be 1, 2, 3, or 4 + (grayscale, grayscale with alpha, RGB, RGBA). Any of the dimensions may be + statically unknown (i.e., `None`). Floating point data will be clipped to + the range [0,1]. Other data types will be clipped into an allowed range + for safe casting to uint8, using `tf.image.convert_image_dtype`. + step: Explicit `int64`-castable monotonic step value for this summary. If + omitted, this defaults to `tf.summary.experimental.get_step()`, which must + not be None. + max_outputs: Optional `int` or rank-0 integer `Tensor`. At most this many + images will be emitted at each step. When more than `max_outputs` many + images are provided, the first `max_outputs` many images will be used and + the rest silently discarded. + description: Optional long-form description for this summary, as a constant + `str`. Markdown is supported. Defaults to empty. + + Returns: + True on success, or false if no summary was emitted because no default + summary writer was available. + + Raises: + ValueError: if a default writer exists, but no step was provided and + `tf.summary.experimental.get_step()` is None. + """ + try: + from tensorboard.summary.v2 import image as image_v2 # pylint: disable=g-import-not-at-top, g-importing-member + except ImportError as exc: + raise TBNotInstalledError("tf.summary.image") from exc + return image_v2( + name=name, + data=data, + step=step, + max_outputs=max_outputs, + description=description, + ) + + +@tf_export("summary.scalar", v1=[]) +def scalar(name, data, step=None, description=None): + """Write a scalar summary. + + See also `tf.summary.image`, `tf.summary.histogram`, + `tf.summary.SummaryWriter`. + + Writes simple numeric values for later analysis in TensorBoard. Writes go to + the current default summary writer. Each summary point is associated with an + integral `step` value. This enables the incremental logging of time series + data. A common usage of this API is to log loss during training to produce + a loss curve. + + For example: + + ```python + test_summary_writer = tf.summary.create_file_writer('test/logdir') + with test_summary_writer.as_default(): + tf.summary.scalar('loss', 0.345, step=1) + tf.summary.scalar('loss', 0.234, step=2) + tf.summary.scalar('loss', 0.123, step=3) + ``` + + Multiple independent time series may be logged by giving each series a unique + `name` value. + + See [Get started with + TensorBoard](https://www.tensorflow.org/tensorboard/get_started) + for more examples of effective usage of `tf.summary.scalar`. + + In general, this API expects that data points are logged with a monotonically + increasing step value. Duplicate points for a single step or points logged out + of order by step are not guaranteed to display as desired in TensorBoard. + + Arguments: + name: A name for this summary. 
The summary tag used for TensorBoard will be + this name prefixed by any active name scopes. + data: A real numeric scalar value, convertible to a `float32` Tensor. + step: Explicit `int64`-castable monotonic step value for this summary. If + omitted, this defaults to `tf.summary.experimental.get_step()`, which must + not be None. + description: Optional long-form description for this summary, as a constant + `str`. Markdown is supported. Defaults to empty. + + Returns: + True on success, or false if no summary was written because no default + summary writer was available. + + Raises: + ValueError: if a default writer exists, but no step was provided and + `tf.summary.experimental.get_step()` is None. + """ + try: + from tensorboard.summary.v2 import scalar as scalar_v2 # pylint: disable=g-import-not-at-top, g-importing-member + except ImportError as exc: + raise TBNotInstalledError("tf.summary.scalar") from exc + return scalar_v2(name=name, data=data, step=step, description=description) + + +@tf_export("summary.text", v1=[]) +def text(name, data, step=None, description=None): + r"""Write a text summary. + + See also `tf.summary.scalar`, `tf.summary.SummaryWriter`, `tf.summary.image`. + + Writes text Tensor values for later visualization and analysis in TensorBoard. + Writes go to the current default summary writer. Like `tf.summary.scalar` + points, text points are each associated with a `step` and a `name`. + All the points with the same `name` constitute a time series of text values. + + For Example: + ```python + test_summary_writer = tf.summary.create_file_writer('test/logdir') + with test_summary_writer.as_default(): + tf.summary.text('first_text', 'hello world!', step=0) + tf.summary.text('first_text', 'nice to meet you!', step=1) + ``` + + The text summary can also contain Markdown, and TensorBoard will render the + text + as such. + + ```python + with test_summary_writer.as_default(): + text_data = ''' + | *hello* | *there* | + |---------|---------| + | this | is | + | a | table | + ''' + text_data = '\n'.join(l.strip() for l in text_data.splitlines()) + tf.summary.text('markdown_text', text_data, step=0) + ``` + + Since text is Tensor valued, each text point may be a Tensor of string values. + rank-1 and rank-2 Tensors are rendered as tables in TensorBoard. For higher + ranked + Tensors, you'll see just a 2D slice of the data. To avoid this, reshape the + Tensor + to at most rank-2 prior to passing it to this function. + + Demo notebook at + ["Displaying text data in + TensorBoard"](https://www.tensorflow.org/tensorboard/text_summaries). + + Arguments: + name: A name for this summary. The summary tag used for TensorBoard will be + this name prefixed by any active name scopes. + data: A UTF-8 string Tensor value. + step: Explicit `int64`-castable monotonic step value for this summary. If + omitted, this defaults to `tf.summary.experimental.get_step()`, which must + not be None. + description: Optional long-form description for this summary, as a constant + `str`. Markdown is supported. Defaults to empty. + + Returns: + True on success, or false if no summary was emitted because no default + summary writer was available. + + Raises: + ValueError: if a default writer exists, but no step was provided and + `tf.summary.experimental.get_step()` is None. 
+ """ + try: + from tensorboard.summary.v2 import text as text_v2 # pylint: disable=g-import-not-at-top, g-importing-member + except ImportError as exc: + raise TBNotInstalledError("tf.summary.text") from exc + return text_v2(name=name, data=data, step=step, description=description) diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index cae983b25dfb02..21fecca23371bc 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -858,6 +858,23 @@ PYBIND11_MODULE(_pywrap_tfe, m) { // TODO(b/309152522): Remove the switch once it works on Windows. #if !IS_OSS pybind11_protobuf::ImportNativeProtoCasters(); + m.def( + "TFE_ContextAddFunctionDefNoSerialization", + [](py::handle& ctx, tensorflow::FunctionDef function_def) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + // Annotate eager runtime construction context to the given + // `function_def` as an attribute. + tensorflow::AttrValue value; + SetAttrValue("kEagerRuntime", &value); + (*function_def.mutable_attr())["_construction_context"] = value; + status->status = tensorflow::unwrap(tensorflow::InputTFE_Context(ctx)) + ->AddFunctionDef(function_def); + tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); + return; + }, + pybind11::arg("ctx"), pybind11::arg("function_def")); + m.def("TFE_ContextGetFunctionDefNoSerialization", [](py::handle& ctx, const char* function_name) -> tensorflow::FunctionDef { @@ -885,6 +902,14 @@ PYBIND11_MODULE(_pywrap_tfe, m) { LOG(FATAL) << "This function cannot be called."; return -1; }); + m.def("TFE_ContextAddFunctionDefNoSerialization", + // Opensource fails whenever a protobuf is used as argument. The + // disrepency in the type is to make opensource tests pass. + [](py::handle& ctx, int function_def) { + LOG(FATAL) << "This function cannot be called."; + return -1; + }); + #endif m.def("TFE_ContextGetGraphDebugInfo", [](py::handle& ctx, const char* function_name, TF_Buffer& buf) { diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl index 7ea32b6cb51e57..763a5f241581b7 100644 --- a/tensorflow/python/tools/api/generator/api_gen.bzl +++ b/tensorflow/python/tools/api/generator/api_gen.bzl @@ -133,7 +133,7 @@ def gen_api_init_files( srcs_version = "PY3", visibility = ["//visibility:public"], deps = package_deps + [ - "//tensorflow/python/util:tf_decorator", + "//tensorflow/python/util:tf_decorator_py", "//tensorflow/python/util:tf_export", "//tensorflow/python/util:module_wrapper", "//tensorflow/python/tools/api/generator:doc_srcs", diff --git a/tensorflow/python/tools/api/generator2/generate_api.bzl b/tensorflow/python/tools/api/generator2/generate_api.bzl index 64e9b96276eebe..c2a96438576d22 100644 --- a/tensorflow/python/tools/api/generator2/generate_api.bzl +++ b/tensorflow/python/tools/api/generator2/generate_api.bzl @@ -1,5 +1,6 @@ """Rules to generate the TensorFlow public API from annotated files.""" +# Placeholder: load PyInfo load("@bazel_skylib//lib:paths.bzl", "paths") load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "TENSORFLOW_API_INIT_FILES") load(":apis.bzl", _APIS = "APIS") diff --git a/tensorflow/python/tools/print_selective_registration_header.py b/tensorflow/python/tools/print_selective_registration_header.py index 8ae04c137e4eb4..6809ea62f51513 100644 --- a/tensorflow/python/tools/print_selective_registration_header.py +++ b/tensorflow/python/tools/print_selective_registration_header.py @@ -32,10 +32,17 @@ """ 
import argparse +import contextlib import sys from absl import app -from tensorflow.python.tools import selective_registration_header_lib + +# The import statement prints "Using TensorFlow backend", which gets piped to +# ops_to_register.h. Redirect the import's stdout to /dev/null to avoid this. +with open('/dev/null', 'w') as f, contextlib.redirect_stdout(f): + # pylint: disable=g-import-not-at-top + from tensorflow.python.tools import selective_registration_header_lib + # pylint: enable=g-import-not-at-top FLAGS = None diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD index d3d06e05502fdc..751078f59d56b9 100644 --- a/tensorflow/python/tpu/BUILD +++ b/tensorflow/python/tpu/BUILD @@ -99,7 +99,6 @@ tpu_py_strict_test( disable_mlir_bridge = False, deps = [ ":async_checkpoint", - ":tpu_estimator", ":tpu_lib", "//tensorflow/core:protos_all_py", "//tensorflow/python/compat:v2_compat", @@ -169,42 +168,6 @@ py_strict_library( ], ) -py_strict_library( - name = "tpu_estimator", - srcs = [ - "error_handling.py", - "tpu_config.py", - "tpu_context.py", - "tpu_estimator.py", - "util.py", - ], - srcs_version = "PY3", - deps = [ - ":async_checkpoint", - ":feature_column", - ":feature_column_v2", - ":functional", - ":preempted_hook_py", - ":tpu_embedding", - ":tpu_lib", "//tensorflow/core:protos_all_py", - "//tensorflow/python/client:session", - "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/estimator:util", - "//tensorflow/python/framework:for_generated_wrappers", - "//tensorflow/python/framework:function", - "//tensorflow/python/ops:array_ops", - "//tensorflow/python/ops:control_flow_ops", - "//tensorflow/python/ops:init_ops", - "//tensorflow/python/ops:math_ops", - "//tensorflow/python/ops:state_ops", - "//tensorflow/python/ops:summary_ops_v2", - "//tensorflow/python/ops:variable_scope", - "//tensorflow/python/ops:variables", - "//tensorflow/python/training", ], ) - py_strict_library( name = "functional", srcs = ["functional.py"], diff --git a/tensorflow/python/tpu/async_checkpoint_test.py b/tensorflow/python/tpu/async_checkpoint_test.py index 070eff0e20c60e..3601c5fad6cc0d 100644 --- a/tensorflow/python/tpu/async_checkpoint_test.py +++ b/tensorflow/python/tpu/async_checkpoint_test.py @@ -33,13 +33,13 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model.pywrap_saved_model import metrics from tensorflow.python.tpu import async_checkpoint -from tensorflow.python.tpu import tpu_config -from tensorflow.python.tpu import tpu_estimator from tensorflow.python.tpu import tpu_optimizer from tensorflow.python.training import basic_session_run_hooks from tensorflow.python.training import training from tensorflow_estimator.python.estimator import estimator as estimator_lib from tensorflow_estimator.python.estimator import model_fn as model_fn_lib +from tensorflow_estimator.python.estimator.tpu import tpu_config +from tensorflow_estimator.python.estimator.tpu import tpu_estimator FLAGS = flags.FLAGS flags.DEFINE_string('tpu', '', 'TPU to use in this test.') diff --git a/tensorflow/python/tpu/error_handling.py b/tensorflow/python/tpu/error_handling.py deleted file mode 100644 index 1e6660af511bc1..00000000000000 --- a/tensorflow/python/tpu/error_handling.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
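The stdout redirection added to print_selective_registration_header.py above is a general trick for silencing modules that print at import time. A standalone sketch using only the standard library (`this` is used purely because it prints on first import):

```python
import contextlib
import os

# Send anything printed during the import to the null device.
with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
    import this  # prints the Zen of Python on first import; suppressed here

print("import finished without writing to stdout")
```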
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Stub file to maintain backwards compatibility.""" - -# pylint: disable=wildcard-import,unused-import -from tensorflow_estimator.python.estimator.tpu.error_handling import * -# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/tpu/tpu_context.py b/tensorflow/python/tpu/tpu_context.py deleted file mode 100644 index d1f3ee55723df3..00000000000000 --- a/tensorflow/python/tpu/tpu_context.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Stub file to maintain backwards compatibility.""" - -# pylint: disable=wildcard-import,unused-import -from tensorflow_estimator.python.estimator.tpu.tpu_context import * -# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/tpu/tpu_embedding_v3_utils.py b/tensorflow/python/tpu/tpu_embedding_v3_utils.py index ed30d9947c8842..276731051be54f 100644 --- a/tensorflow/python/tpu/tpu_embedding_v3_utils.py +++ b/tensorflow/python/tpu/tpu_embedding_v3_utils.py @@ -73,7 +73,8 @@ def unshuffle_from_sc_to_cpu( shards = shards_t[:, offset_in_shard : offset_in_shard + size_in_shard, :] # This table's shards were rotated by `shard_rotation`, so we need to rotate # the same amount in opposite direction - shards = manip_ops.roll(shards, -shard_rotation, axis=0) + if shard_rotation: + shards = manip_ops.roll(shards, -shard_rotation, axis=0) # Re-arrange (transpose and reshape) the shards to get the queried embedding # table. intermediate_tensor = array_ops.transpose(shards, (1, 0, 2)) @@ -169,6 +170,12 @@ def __init__(self, stacked_layouts, table_to_config): shape=variable_shape, dtype=dtypes.float32, ) + # TODO(b/312743130): This is a workaround. During checkpoint restoration + # optimizer expects the trackable to provide a `_unique_id` or equivalent. + # Remove this when the bug is fixed. + @property + def _unique_id(self): + return self.vars[self._stacked_layouts[0].table_name]._unique_id def _serialize_to_tensors(self) -> Any: return { diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py deleted file mode 100644 index f28db848e56252..00000000000000 --- a/tensorflow/python/tpu/tpu_estimator.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Stub file to maintain backwards compatibility.""" - -# pylint: disable=wildcard-import,unused-import,redefined-builtin -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import * -# used by tests -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _clone_export_output_with_tensors -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _create_global_step -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _export_output_to_tensors -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _get_scaffold -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _Inputs -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ENQUEUE_OPS -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_ESTIMATOR -from tensorflow_estimator.python.estimator.tpu.tpu_estimator import _TPU_TRAIN_OP -# pylint: enable=wildcard-import,unused-import,redefined-builtin diff --git a/tensorflow/python/tpu/util.py b/tensorflow/python/tpu/util.py deleted file mode 100644 index c5b8964b20a6e2..00000000000000 --- a/tensorflow/python/tpu/util.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Stub file to maintain backwards compatibility.""" - -# pylint: disable=wildcard-import,unused-import -from tensorflow_estimator.python.estimator.tpu.util import * -# pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/python/trackable/BUILD b/tensorflow/python/trackable/BUILD index 67d4811402c864..a0a315b85fd475 100644 --- a/tensorflow/python/trackable/BUILD +++ b/tensorflow/python/trackable/BUILD @@ -51,7 +51,8 @@ py_strict_library( "//tensorflow/python/framework:ops", "//tensorflow/python/ops:control_flow_ops_gen", "//tensorflow/python/training/saving:saveable_object", - "//tensorflow/python/util:tf_decorator", + "//tensorflow/python/util:tf_contextlib", + "//tensorflow/python/util:tf_decorator_py", "//tensorflow/python/util:tf_export", ], ) @@ -188,7 +189,7 @@ py_strict_library( "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:ops", "//tensorflow/python/framework:tensor", - "//tensorflow/python/util:tf_decorator", + "//tensorflow/python/util:tf_contextlib", "//tensorflow/python/util:tf_export", ], ) diff --git a/tensorflow/python/training/BUILD b/tensorflow/python/training/BUILD index a041d50f4b61cb..847e93392b2c67 100644 --- a/tensorflow/python/training/BUILD +++ b/tensorflow/python/training/BUILD @@ -75,7 +75,6 @@ py_strict_library( visibility = [ "//tensorflow:internal", "//tensorflow_minigo:__subpackages__", - "//tensorflow_model_optimization:__subpackages__", "//tensorflow_models:__subpackages__", "//third_party/cloud_tpu/convergence_tools:__subpackages__", "//third_party/mlperf:__subpackages__", @@ -229,7 +228,6 @@ py_strict_library( srcs_version = "PY3", visibility = [ "//tensorflow:internal", - "//tensorflow_estimator/python/estimator:__pkg__", "//third_party/py/tf_slim/training:__pkg__", ], deps = [ @@ -340,7 +338,6 @@ py_strict_library( srcs_version = "PY3", visibility = [ "//tensorflow:internal", - "//tensorflow_model_optimization/python/core/quantization/keras:__pkg__", "//third_party/py/tf_slim/layers:__pkg__", ], deps = [ diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 92ec7cff402129..fd4243e4d3021e 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -1029,9 +1029,10 @@ def _RecordLastCheckpoint(self, latest_save_path): if not self.saver_def.max_to_keep: return # Remove first from list if the same name was used before. 
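For context on the saver.py hunk below: removing entries from a list while iterating over that same list can skip elements, so the loop iterates over a slice copy (`self._last_checkpoints[:]`) instead. A standalone illustration with made-up checkpoint entries:

```python
checkpoints = [("ckpt-1", 1.0), ("ckpt-1", 2.0), ("ckpt-2", 3.0)]

# Iterate over a copy so removals from the original list cannot skip items.
for entry in checkpoints[:]:
    if entry[0] == "ckpt-1":
        checkpoints.remove(entry)

assert checkpoints == [("ckpt-2", 3.0)]
```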
- for p in self._last_checkpoints: + for p in self._last_checkpoints[:]: if latest_save_path == self._CheckpointFilename(p): self._last_checkpoints.remove(p) + # Append new path to list self._last_checkpoints.append((latest_save_path, time.time())) diff --git a/tensorflow/python/types/BUILD b/tensorflow/python/types/BUILD index c04dc039153fa1..799ca38c72981a 100644 --- a/tensorflow/python/types/BUILD +++ b/tensorflow/python/types/BUILD @@ -21,7 +21,6 @@ pytype_strict_library( deps = [ ":doc_typealias", "//tensorflow/python:pywrap_tensorflow", - "//tensorflow/python/util:_pywrap_utils", "//tensorflow/python/util:tf_export", "//third_party/py/numpy", "@pypi_typing_extensions//:pkg", diff --git a/tensorflow/python/types/core.py b/tensorflow/python/types/core.py index 16c9d24593e2ab..534211fd9d29ba 100644 --- a/tensorflow/python/types/core.py +++ b/tensorflow/python/types/core.py @@ -26,7 +26,6 @@ from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import, g-bad-import-order -from tensorflow.python.util import _pywrap_utils from tensorflow.python.util.tf_export import tf_export # pylint:disable=g-import-not-at-top @@ -385,10 +384,6 @@ def __tf_tensor__(self, dtype=None, name=None): pass -_pywrap_utils.RegisterType("TensorProtocol", TensorProtocol) -_pywrap_utils.RegisterType("CoreTypeValue", Value) - - # TODO(rahulkamat): Add missing types that are convertible to Tensor. TensorLike = Union[Tensor, TensorProtocol, int, float, bool, str, bytes, complex, tuple, list, np.ndarray, np.generic] diff --git a/tensorflow/python/util/BUILD b/tensorflow/python/util/BUILD index 7d2d9ef5398809..d8cf6ed3e5c7b1 100644 --- a/tensorflow/python/util/BUILD +++ b/tensorflow/python/util/BUILD @@ -43,8 +43,10 @@ package( py_strict_library( name = "core", deps = [ - ":tf_decorator", + ":tf_contextlib", + ":tf_decorator_py", ":tf_export", + ":tf_inspect", ":tf_stack", ], ) @@ -361,7 +363,7 @@ pytype_strict_library( ], ) -py_strict_library( +pytype_strict_library( name = "tf_contextlib", srcs = ["tf_contextlib.py"], compatible_with = get_compatible_with_portable(), @@ -413,29 +415,6 @@ tf_py_strict_test( ], ) -# Leaf library: may not depend on anything else inside TensorFlow. -# TODO(mdan): Move this utility outside of TF. -py_strict_library( - name = "tf_decorator", - compatible_with = get_compatible_with_portable(), - deprecation = "This target has been split. Depend on the sub-targets instead.", - srcs_version = "PY3", - visibility = [ - "//tensorflow:__subpackages__", - # TODO(mdan): Remove these dependencies. - "//third_party/py/tf_slim:__subpackages__", - "//learning/deepmind/research/language/translation/lm:__subpackages__", - "//learning/brain/analytics:__subpackages__", - "//tensorflow:__pkg__", - "//third_party/py/tensorflow_core:__subpackages__", - ], - deps = [ - ":tf_contextlib", - ":tf_decorator_py", - ":tf_inspect", - ], -) - py_strict_library( name = "tf_stack", srcs = ["tf_stack.py"], @@ -445,7 +424,6 @@ py_strict_library( deps = [ ":_tf_stack", "//tensorflow/core:protos_all_py", - "@six_archive//:six", ], ) @@ -770,7 +748,6 @@ py_strict_library( # library. It isn't possible to add these test dependencies via tensorflow.bzl's # py_test because not all tensorflow tests use tensorflow.bzl's py_test. "//tensorflow/python:global_test_configuration", - "@six_archive//:six", "@pypi_wrapt//:pkg", "//tensorflow/python:pywrap_tensorflow", ":_pywrap_utils", @@ -788,7 +765,6 @@ py_strict_library( # library. 
It isn't possible to add these test dependencies via tensorflow.bzl's # py_test because not all tensorflow tests use tensorflow.bzl's py_test. "//tensorflow/python:global_test_configuration", - "@six_archive//:six", ], ) @@ -887,7 +863,6 @@ py_strict_library( "//tensorflow/python:global_test_configuration", ":tf_export", "//third_party/py/numpy", - "@six_archive//:six", ], ) @@ -997,8 +972,6 @@ py_strict_library( # library. It isn't possible to add these test dependencies via tensorflow.bzl's # py_test because not all tensorflow tests use tensorflow.bzl's py_test. "//tensorflow/python:global_test_configuration", - ":tf_decorator", - "@six_archive//:six", ], ) @@ -1067,14 +1040,12 @@ py_strict_library( visibility = util_subpackage_visibility, deps = [ ":__init__", - ":compat", ":nest_util", # global_test_configuration is added here because all major tests depend on this # library. It isn't possible to add these test dependencies via tensorflow.bzl's # py_test because not all tensorflow tests use tensorflow.bzl's py_test. "//tensorflow/python:global_test_configuration", ":tf_export", - "@pypi_wrapt//:pkg", ":_pywrap_utils", ":_pywrap_nest", ], diff --git a/tensorflow/python/util/_pywrap_utils.pyi b/tensorflow/python/util/_pywrap_utils.pyi index c8e51ec4fa961f..f5c7af0c990e0a 100644 --- a/tensorflow/python/util/_pywrap_utils.pyi +++ b/tensorflow/python/util/_pywrap_utils.pyi @@ -32,5 +32,4 @@ def IsTensor(arg0: object) -> bool: ... def IsTypeSpec(arg0: object) -> bool: ... def IsVariable(arg0: object) -> bool: ... def RegisterPyObject(arg0: object, arg1: object) -> object: ... -def RegisterType(arg0: object, arg1: object) -> object: ... def SameNamedtuples(arg0: object, arg1: object) -> object: ... diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py index 0d3c1a2b3c6582..7a4659e0f62251 100644 --- a/tensorflow/python/util/compat.py +++ b/tensorflow/python/util/compat.py @@ -45,20 +45,14 @@ API docstring: tensorflow.compat """ +import codecs +import collections.abc as collections_abc # pylint: disable=unused-import import numbers as _numbers import numpy as _np -import six as _six -import codecs from tensorflow.python.util.tf_export import tf_export -try: - # This import only works on python 3.3 and above. - import collections.abc as collections_abc # pylint: disable=unused-import -except ImportError: - import collections as collections_abc # pylint: disable=unused-import - def as_bytes(bytes_or_text, encoding='utf-8'): """Converts `bytearray`, `bytes`, or unicode python input types to `bytes`. @@ -79,7 +73,7 @@ def as_bytes(bytes_or_text, encoding='utf-8'): encoding = codecs.lookup(encoding).name if isinstance(bytes_or_text, bytearray): return bytes(bytes_or_text) - elif isinstance(bytes_or_text, _six.text_type): + elif isinstance(bytes_or_text, str): return bytes_or_text.encode(encoding) elif isinstance(bytes_or_text, bytes): return bytes_or_text @@ -106,7 +100,7 @@ def as_text(bytes_or_text, encoding='utf-8'): """ # Validate encoding, a LookupError will be raised if invalid. encoding = codecs.lookup(encoding).name - if isinstance(bytes_or_text, _six.text_type): + if isinstance(bytes_or_text, str): return bytes_or_text elif isinstance(bytes_or_text, bytes): return bytes_or_text.decode(encoding) @@ -212,6 +206,6 @@ def path_to_bytes(path): tf_export('compat.complex_types').export_constant(__name__, 'complex_types') # Either bytes or text. 
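The `six` helpers being dropped in compat.py above (and in function_utils.py below) have direct Python 3 built-in equivalents. A quick sketch of the replacements this change relies on; the class `C` exists only for illustration:

```python
class C:
    def m(self):
        return 1

bound = C().m

assert isinstance("text", str)        # was: isinstance(..., six.text_type)
assert bound.__self__.__class__ is C  # was: six.get_method_self(bound).__class__
assert bound.__func__ is C.m          # was: six.get_method_function(bound)
assert bound.__func__.__code__        # was: six.get_function_code(bound)
```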
-bytes_or_text_types = (bytes, _six.text_type) +bytes_or_text_types = (bytes, str) tf_export('compat.bytes_or_text_types').export_constant(__name__, 'bytes_or_text_types') diff --git a/tensorflow/python/util/function_utils.py b/tensorflow/python/util/function_utils.py index fa978fe12d56ac..743a81343240c1 100644 --- a/tensorflow/python/util/function_utils.py +++ b/tensorflow/python/util/function_utils.py @@ -16,8 +16,6 @@ import functools -import six - from tensorflow.core.protobuf import config_pb2 from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect @@ -89,8 +87,10 @@ def get_func_name(func): if tf_inspect.isfunction(func): return func.__name__ elif tf_inspect.ismethod(func): - return '%s.%s' % (six.get_method_self(func).__class__.__name__, - six.get_method_function(func).__name__) + return '%s.%s' % ( + func.__self__.__class__.__name__, + func.__func__.__name__, + ) else: # Probably a class instance with __call__ return str(type(func)) else: @@ -104,13 +104,13 @@ def get_func_code(func): _, func = tf_decorator.unwrap(func) if callable(func): if tf_inspect.isfunction(func) or tf_inspect.ismethod(func): - return six.get_function_code(func) + return func.__code__ # Since the object is not a function or method, but is a callable, we will # try to access the __call__method as a function. This works with callable # classes but fails with functool.partial objects despite their __call__ # attribute. try: - return six.get_function_code(func.__call__) + return func.__call__.__code__ except AttributeError: return None else: diff --git a/tensorflow/python/util/lazy_loader.py b/tensorflow/python/util/lazy_loader.py index 717965d0123614..7d8c186677583f 100644 --- a/tensorflow/python/util/lazy_loader.py +++ b/tensorflow/python/util/lazy_loader.py @@ -106,6 +106,9 @@ def __dir__(self): module = self._load() return dir(module) + def __reduce__(self): + return importlib.import_module, (self.__name__,) + class KerasLazyLoader(LazyLoader): """LazyLoader that handles routing to different Keras version.""" diff --git a/tensorflow/python/util/lazy_loader_test.py b/tensorflow/python/util/lazy_loader_test.py index 94f258131772c1..e59ef2c888edc1 100644 --- a/tensorflow/python/util/lazy_loader_test.py +++ b/tensorflow/python/util/lazy_loader_test.py @@ -17,6 +17,7 @@ # pylint: disable=unused-import import doctest import inspect +import pickle import types from tensorflow.python.platform import test @@ -54,5 +55,16 @@ def testLazyLoaderMock(self, mock_warning): self.assertEqual(lazy_loader_module.foo, foo) +class PickleTest(test.TestCase): + + def testPickleLazyLoader(self): + name = PickleTest.__module__ # Try to pickle current module. 
+ lazy_loader_module = lazy_loader.LazyLoader( + "lazy_loader_module", globals(), name) + restored = pickle.loads(pickle.dumps(lazy_loader_module)) + self.assertEqual(restored.__name__, name) + self.assertIsNotNone(restored.PickleTest) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index d7acf836ce5e1f..748fc3b167f5c8 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -90,12 +90,9 @@ API docstring: tensorflow.nest """ -import wrapt as _wrapt - from tensorflow.python.util import _pywrap_nest from tensorflow.python.util import _pywrap_utils from tensorflow.python.util import nest_util -from tensorflow.python.util.compat import collections_abc as _collections_abc from tensorflow.python.util.tf_export import tf_export @@ -1315,10 +1312,3 @@ def sequence_fn(instance, args): False, sequence_fn=sequence_fn, ) - - -_pywrap_utils.RegisterType("Mapping", _collections_abc.Mapping) -_pywrap_utils.RegisterType("MutableMapping", _collections_abc.MutableMapping) -_pywrap_utils.RegisterType("Sequence", _collections_abc.Sequence) -_pywrap_utils.RegisterType("MappingView", _collections_abc.MappingView) -_pywrap_utils.RegisterType("ObjectProxy", _wrapt.ObjectProxy) diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py index 26341624c06619..0378076cba247b 100644 --- a/tensorflow/python/util/nest_test.py +++ b/tensorflow/python/util/nest_test.py @@ -154,24 +154,24 @@ class UnsortedSampleAttr(object): field1 = attr.ib() field2 = attr.ib() - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassCustomProtocol(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) self.assertIsInstance(mt, CustomNestProtocol) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassIsNested(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) self.assertTrue(nest.is_nested(mt)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlatten(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) leaves = nest.flatten(mt) self.assertLen(leaves, 1) self.assertAllEqual(leaves[0], [1]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenUpToCompatible(self): simple_list = [2] mt = MaskedTensor(mask=True, value=constant_op.constant([1])) @@ -200,7 +200,7 @@ def testDataclassFlattenUpToCompatible(self): ) self.assertAllEqual(flat_path_nested_list, [2]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenUpToIncompatible(self): simple_list = [2] mt = MaskedTensor(mask=True, value=constant_op.constant([1])) @@ -239,7 +239,7 @@ def testDataclassFlattenUpToIncompatible(self): shallow_tree=nested_list, input_tree=mt, check_types=False ) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenWithTuplePathsUpToCompatible(self): simple_list = [2] mt = MaskedTensor(mask=True, value=constant_op.constant([1])) @@ -271,7 +271,7 @@ def testDataclassFlattenWithTuplePathsUpToCompatible(self): ) self.assertAllEqual(flat_path_nested_list, [[(0, 0), 2]]) - @test_util.assert_no_new_pyobjects_executing_eagerly + 
@test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenWithTuplePathsUpToIncompatible(self): simple_list = [2] mt = MaskedTensor(mask=True, value=constant_op.constant([1])) @@ -311,7 +311,7 @@ def testDataclassFlattenWithTuplePathsUpToIncompatible(self): shallow_tree=nested_list2, input_tree=nmt, check_types=False ) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenAndPack(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) leaves = nest.flatten(mt) @@ -319,7 +319,7 @@ def testDataclassFlattenAndPack(self): self.assertIsInstance(reconstructed_mt, MaskedTensor) self.assertEqual(reconstructed_mt, mt) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassMapStructure(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) mt_doubled = nest.map_structure(lambda x: x * 2, mt) @@ -327,7 +327,7 @@ def testDataclassMapStructure(self): self.assertEqual(mt_doubled.mask, True) self.assertAllEqual(mt_doubled.value, [2]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassMapStructureWithPaths(self): mt = MaskedTensor(mask=False, value=constant_op.constant([1])) mt2 = MaskedTensor(mask=True, value=constant_op.constant([2])) @@ -360,7 +360,7 @@ def path_sum(path, *tensors): self.assertAllEqual(nmt_combined_with_path.value.value[0], "0/0") self.assertAllEqual(nmt_combined_with_path.value.value[1], [9]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassMapStructureWithTuplePaths(self): mt = MaskedTensor(mask=False, value=constant_op.constant([1])) mt2 = MaskedTensor(mask=True, value=constant_op.constant([2])) @@ -395,7 +395,7 @@ def tuple_path_sum(tuple_path, *tensors): self.assertAllEqual(nmt_combined_with_path.value.value[0], (0, 0)) self.assertAllEqual(nmt_combined_with_path.value.value[1], [9]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassMapStructureUpTo(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) mt2 = MaskedTensor(mask=True, value=constant_op.constant([2])) @@ -431,7 +431,7 @@ def sum_tensors(*tensors): self.assertEqual(nmt_combined_with_path.value.mask, True) self.assertAllEqual(nmt_combined_with_path.value.value, [9]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassMapStructureWithTuplePathsUoTo(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) mt2 = MaskedTensor(mask=True, value=constant_op.constant([2])) @@ -470,7 +470,7 @@ def tuple_path_sum(tuple_path, *tensors): self.assertAllEqual(nmt_combined_with_path.value.value[0], (0, 0)) self.assertAllEqual(nmt_combined_with_path.value.value[1], [9]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNestedDataclassIsNested(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) self.assertTrue(nest.is_nested(mt)) @@ -480,7 +480,7 @@ def testNestedDataclassIsNested(self): ) self.assertTrue(nest.is_nested(nmt)) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassAssertShallowStructure(self): # These assertions are expected to 
pass: two dataclasses with the same # component size are considered to have the same shallow structure. @@ -535,7 +535,7 @@ def testDataclassAssertShallowStructure(self): shallow_tree=nmt, input_tree=mt, check_types=False ) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassGetTraverseShallowStructure(self): nmt = NestedMaskedTensor.nested_masked_tensor_with_opposite_masks( mask=True, inner_value=constant_op.constant([1]) @@ -568,7 +568,7 @@ def testDataclassGetTraverseShallowStructure(self): self.assertEqual(traverse_result3, False) nest.assert_shallow_structure(traverse_result3, nmt) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNestedDataclassFlatten(self): nmt = NestedMaskedTensor.nested_masked_tensor_with_opposite_masks( mask=True, inner_value=constant_op.constant([1]) @@ -577,7 +577,7 @@ def testNestedDataclassFlatten(self): self.assertLen(leaves, 1) self.assertAllEqual(leaves[0], [1]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNestedDataclassFlattenAndPack(self): nmt = NestedMaskedTensor.nested_masked_tensor_with_opposite_masks( mask=True, inner_value=constant_op.constant([1]) @@ -587,7 +587,7 @@ def testNestedDataclassFlattenAndPack(self): self.assertIsInstance(reconstructed_mt, NestedMaskedTensor) self.assertEqual(reconstructed_mt, nmt) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testNestedDataclassMapStructure(self): nmt = NestedMaskedTensor.nested_masked_tensor_with_opposite_masks( mask=True, inner_value=constant_op.constant([1]) @@ -602,7 +602,7 @@ def testNestedDataclassMapStructure(self): self.assertEqual(mt_doubled.value.mask, expected.value.mask) self.assertAllEqual(mt_doubled.value.value, expected.value.value) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassYieldFlatPaths(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) mt_flat_paths = list(nest.yield_flat_paths(mt)) @@ -626,7 +626,7 @@ def testDataclassYieldFlatPaths(self): ], ) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenWithStringPaths(self): sep = "/" mt = MaskedTensor(mask=True, value=constant_op.constant([1])) @@ -650,7 +650,7 @@ def testDataclassFlattenWithStringPaths(self): self.assertEqual(dict_mt_nmt_flat_paths[1][0], "nmt/0/0") self.assertAllEqual(dict_mt_nmt_flat_paths[1][1], [2]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassFlattenWithTuplePaths(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) mt_flat_paths = nest.flatten_with_tuple_paths(mt) @@ -671,7 +671,7 @@ def testDataclassFlattenWithTuplePaths(self): self.assertEqual(dict_mt_nmt_flat_paths[1][0], ("nmt", 0, 0)) self.assertAllEqual(dict_mt_nmt_flat_paths[1][1], [2]) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testDataclassListToTuple(self): mt = MaskedTensor(mask=True, value=constant_op.constant([1])) nmt = NestedMaskedTensor.nested_masked_tensor_with_opposite_masks( @@ -690,7 +690,7 @@ def testDataclassListToTuple(self): ) nest.assert_same_structure(results, expected) - 
@test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testAttrsFlattenAndPack(self): if attr is None: self.skipTest("attr module is unavailable.") @@ -715,7 +715,7 @@ def testAttrsFlattenAndPack(self): {"values": [(1, 2), [3, 4], 5]}, {"values": [PointXY(1, 2), 3, 4]}, ) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testAttrsMapStructure(self, values): if attr is None: self.skipTest("attr module is unavailable.") @@ -724,7 +724,7 @@ def testAttrsMapStructure(self, values): new_structure = nest.map_structure(lambda x: x, structure) self.assertEqual(structure, new_structure) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFlattenAndPack(self): structure = ((3, 4), 5, (6, 7, (9, 10), 8)) flat = ["a", "b", "c", "d", "e", "f", "g", "h"] @@ -761,7 +761,7 @@ def testFlattenAndPack(self): @parameterized.parameters({"mapping_type": collections.OrderedDict}, {"mapping_type": _CustomMapping}) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFlattenDictOrder(self, mapping_type): """`flatten` orders dicts by key, including OrderedDicts.""" ordered = mapping_type([("d", 3), ("b", 1), ("a", 0), ("c", 2)]) @@ -787,7 +787,7 @@ def testPackDictOrder(self, mapping_type): custom_reconstruction) self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFlattenAndPackMappingViews(self): """`flatten` orders dicts by key, including OrderedDicts.""" ordered = collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)]) @@ -806,7 +806,7 @@ def testFlattenAndPackMappingViews(self): Abc = collections.namedtuple("A", ("b", "c")) # pylint: disable=invalid-name - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testFlattenAndPack_withDicts(self): # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s. 
mess = [ @@ -889,7 +889,7 @@ def testPackSequenceAs_CompositeTensor(self): ValueError, "Structure had 2 atoms, but flat_sequence had 1 items."): nest.pack_sequence_as(val, [val], expand_composites=True) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testIsNested(self): self.assertFalse(nest.is_nested("1234")) self.assertTrue(nest.is_nested([1, 3, [4, 5]])) @@ -942,7 +942,7 @@ def testFlattenDictItems(self, mapping_type): class SameNamedType1(SameNameab): pass - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testAssertSameStructure(self): structure1 = (((1, 2), 3), 4, (5, 6)) structure2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6")) @@ -1053,7 +1053,7 @@ def testHeterogeneousComparison(self): nest.assert_same_structure({"a": 4}, _CustomMapping(a=3)) nest.assert_same_structure(_CustomMapping(b=3), {"b": 4}) - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testMapStructure(self): structure1 = (((1, 2), 3), 4, (5, 6)) structure2 = (((7, 8), 9), 10, (11, 12)) @@ -1129,7 +1129,7 @@ def testMapStructure(self): ABTuple = collections.namedtuple("ab_tuple", "a, b") # pylint: disable=invalid-name - @test_util.assert_no_new_pyobjects_executing_eagerly + @test_util.assert_no_new_pyobjects_executing_eagerly() def testMapStructureWithStrings(self): inp_a = NestTest.ABTuple(a="foo", b=("bar", "baz")) inp_b = NestTest.ABTuple(a=2, b=(1, 3)) diff --git a/tensorflow/python/util/nest_util.py b/tensorflow/python/util/nest_util.py index f40cc2d3642341..c53042f7dc11ab 100644 --- a/tensorflow/python/util/nest_util.py +++ b/tensorflow/python/util/nest_util.py @@ -27,7 +27,6 @@ import collections as _collections import enum -import six as _six import wrapt as _wrapt from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import @@ -236,7 +235,7 @@ def sequence_like(instance, args): # Pack a CompositeTensor's components according to a TypeSpec. assert len(args) == 1 return instance._from_components(args[0]) # pylint: disable=protected-access - elif isinstance(instance, _six.moves.range): + elif isinstance(instance, range): return sequence_like(list(instance), args) elif isinstance(instance, _wrapt.ObjectProxy): # For object proxies, first create the underlying type and then re-wrap it diff --git a/tensorflow/python/util/protobuf/BUILD b/tensorflow/python/util/protobuf/BUILD index 85c44a9eedccfa..585c51706e271f 100644 --- a/tensorflow/python/util/protobuf/BUILD +++ b/tensorflow/python/util/protobuf/BUILD @@ -48,7 +48,6 @@ tf_py_strict_test( ":compare_test_proto_py", ":protobuf", "//tensorflow/python/platform:test", - "@six_archive//:six", ], ) @@ -88,7 +87,6 @@ py_strict_library( # py_test because not all tensorflow tests use tensorflow.bzl's py_test. 
"//tensorflow/python:global_test_configuration", "@com_google_protobuf//:protobuf_python", - "@six_archive//:six", "//tensorflow/python/util:compat", ], ) diff --git a/tensorflow/python/util/protobuf/compare.py b/tensorflow/python/util/protobuf/compare.py index 44a9bfd15b3b75..dbc61ae28f4674 100644 --- a/tensorflow/python/util/protobuf/compare.py +++ b/tensorflow/python/util/protobuf/compare.py @@ -58,12 +58,10 @@ def testXXX(self): self.assertProtoEqual(a, b) """ +import collections.abc as collections_abc import difflib import math -from ..compat import collections_abc -import six - from google.protobuf import descriptor from google.protobuf import descriptor_pool from google.protobuf import message @@ -147,7 +145,7 @@ def checkFloatEqAndReplace(self, expected, actual, relative_tolerance): # pylin == descriptor.FieldDescriptor.TYPE_MESSAGE ): for e_v, a_v in zip( - six.itervalues(expected_values), six.itervalues(actual_values) + iter(expected_values.values()), iter(actual_values.values()) ): checkFloatEqAndReplace( self, @@ -191,7 +189,7 @@ def assertProtoEqual( comparisons are done using the relative tolerance provided. """ pool = descriptor_pool.Default() - if isinstance(a, six.string_types): + if isinstance(a, str): a = text_format.Parse(a, b.__class__(), descriptor_pool=pool) for pb in a, b: @@ -281,7 +279,7 @@ def NormalizeNumberFields(pb): # This is a map, only recurse if the values have a message type. if (desc.message_type.fields_by_number[2].type == descriptor.FieldDescriptor.TYPE_MESSAGE): - for v in six.itervalues(values): + for v in iter(values.values()): NormalizeNumberFields(v) else: for v in values: @@ -296,7 +294,7 @@ def _IsMap(value): def _IsRepeatedContainer(value): - if isinstance(value, six.string_types): + if isinstance(value, str): return False try: iter(value) diff --git a/tensorflow/python/util/protobuf/compare_test.py b/tensorflow/python/util/protobuf/compare_test.py index 96484c5df87856..ef521baf2807b6 100644 --- a/tensorflow/python/util/protobuf/compare_test.py +++ b/tensorflow/python/util/protobuf/compare_test.py @@ -19,7 +19,6 @@ import sys import textwrap -import six from google.protobuf import text_format @@ -30,13 +29,7 @@ def LargePbs(*args): """Converts ASCII string Large PBs to messages.""" - pbs = [] - for arg in args: - pb = compare_test_pb2.Large() - text_format.Merge(arg, pb) - pbs.append(pb) - - return pbs + return [text_format.Merge(arg, compare_test_pb2.Large()) for arg in args] class ProtoEqTest(googletest.TestCase): @@ -267,49 +260,44 @@ class NormalizeNumbersTest(googletest.TestCase): """Tests for NormalizeNumberFields().""" def testNormalizesInts(self): - pb = compare_test_pb2.Large() - pb.int64_ = 4 + pb = compare_test_pb2.Large(int64_=4) compare.NormalizeNumberFields(pb) - self.assertTrue(isinstance(pb.int64_, six.integer_types)) + self.assertIsInstance(pb.int64_, int) pb.int64_ = 4 compare.NormalizeNumberFields(pb) - self.assertTrue(isinstance(pb.int64_, six.integer_types)) + self.assertIsInstance(pb.int64_, int) pb.int64_ = 9999999999999999 compare.NormalizeNumberFields(pb) - self.assertTrue(isinstance(pb.int64_, six.integer_types)) + self.assertIsInstance(pb.int64_, int) def testNormalizesRepeatedInts(self): - pb = compare_test_pb2.Large() - pb.int64s.extend([1, 400, 999999999999999]) + pb = compare_test_pb2.Large(int64s=[1, 400, 999999999999999]) compare.NormalizeNumberFields(pb) - self.assertTrue(isinstance(pb.int64s[0], six.integer_types)) - self.assertTrue(isinstance(pb.int64s[1], six.integer_types)) - 
self.assertTrue(isinstance(pb.int64s[2], six.integer_types)) + self.assertIsInstance(pb.int64s[0], int) + self.assertIsInstance(pb.int64s[1], int) + self.assertIsInstance(pb.int64s[2], int) def testNormalizesFloats(self): - pb1 = compare_test_pb2.Large() - pb1.float_ = 1.2314352351231 - pb2 = compare_test_pb2.Large() - pb2.float_ = 1.231435 + pb1 = compare_test_pb2.Large(float_=1.2314352351231) + pb2 = compare_test_pb2.Large(float_=1.231435) self.assertNotEqual(pb1.float_, pb2.float_) compare.NormalizeNumberFields(pb1) compare.NormalizeNumberFields(pb2) self.assertEqual(pb1.float_, pb2.float_) def testNormalizesRepeatedFloats(self): - pb = compare_test_pb2.Large() - pb.medium.floats.extend([0.111111111, 0.111111]) + pb = compare_test_pb2.Large( + medium=compare_test_pb2.Medium(floats=[0.111111111, 0.111111]) + ) compare.NormalizeNumberFields(pb) for value in pb.medium.floats: self.assertAlmostEqual(0.111111, value) def testNormalizesDoubles(self): - pb1 = compare_test_pb2.Large() - pb1.double_ = 1.2314352351231 - pb2 = compare_test_pb2.Large() - pb2.double_ = 1.2314352 + pb1 = compare_test_pb2.Large(double_=1.2314352351231) + pb2 = compare_test_pb2.Large(double_=1.2314352) self.assertNotEqual(pb1.double_, pb2.double_) compare.NormalizeNumberFields(pb1) compare.NormalizeNumberFields(pb2) @@ -326,7 +314,7 @@ class AssertTest(googletest.TestCase): """Tests assertProtoEqual().""" def assertProtoEqual(self, a, b, **kwargs): - if isinstance(a, six.string_types) and isinstance(b, six.string_types): + if isinstance(a, str) and isinstance(b, str): a, b = LargePbs(a, b) compare.assertProtoEqual(self, a, b, **kwargs) @@ -346,8 +334,7 @@ def assertNone(self, a, b, message, **kwargs): def testCheckInitialized(self): # neither is initialized - a = compare_test_pb2.Labeled() - a.optional = 1 + a = compare_test_pb2.Labeled(optional=1) self.assertNone(a, a, 'Initialization errors: ', check_initialized=True) self.assertAll(a, check_initialized=False) @@ -365,8 +352,7 @@ def testCheckInitialized(self): check_initialized=False) # both are initialized - a = compare_test_pb2.Labeled() - a.required = 2 + a = compare_test_pb2.Labeled(required=2) self.assertAll(a, check_initialized=True) self.assertAll(a, check_initialized=False) @@ -382,26 +368,20 @@ def testCheckInitialized(self): self.assertNone(a, b, message, check_initialized=False) def testAssertEqualWithStringArg(self): - pb = compare_test_pb2.Large() - pb.string_ = 'abc' - pb.float_ = 1.234 + pb = compare_test_pb2.Large(string_='abc', float_=1.234) compare.assertProtoEqual(self, """ string_: 'abc' float_: 1.234 """, pb) def testNormalizesNumbers(self): - pb1 = compare_test_pb2.Large() - pb1.int64_ = 4 - pb2 = compare_test_pb2.Large() - pb2.int64_ = 4 + pb1 = compare_test_pb2.Large(int64_=4) + pb2 = compare_test_pb2.Large(int64_=4) compare.assertProtoEqual(self, pb1, pb2) def testNormalizesFloat(self): - pb1 = compare_test_pb2.Large() - pb1.double_ = 4.0 - pb2 = compare_test_pb2.Large() - pb2.double_ = 4 + pb1 = compare_test_pb2.Large(double_=4.0) + pb2 = compare_test_pb2.Large(double_=4) compare.assertProtoEqual(self, pb1, pb2, normalize_numbers=True) def testLargeProtoData(self): @@ -542,9 +522,7 @@ def testRepeatedMessage(self): class MixinTests(compare.ProtoAssertions, googletest.TestCase): def testAssertEqualWithStringArg(self): - pb = compare_test_pb2.Large() - pb.string_ = 'abc' - pb.float_ = 1.234 + pb = compare_test_pb2.Large(string_='abc', float_=1.234) self.assertProtoEqual(""" string_: 'abc' float_: 1.234 diff --git 
a/tensorflow/python/util/tf_contextlib.py b/tensorflow/python/util/tf_contextlib.py index 06a947e26249bb..52f2c3d1c3e3fc 100644 --- a/tensorflow/python/util/tf_contextlib.py +++ b/tensorflow/python/util/tf_contextlib.py @@ -13,12 +13,19 @@ # limitations under the License. # ============================================================================== """TFDecorator-aware replacements for the contextlib module.""" +from collections.abc import Callable, Iterator import contextlib as _contextlib +from typing import ContextManager, TypeVar + from tensorflow.python.util import tf_decorator +_T = TypeVar('_T') + -def contextmanager(target): +def contextmanager( + target: Callable[..., Iterator[_T]], +) -> Callable[..., ContextManager[_T]]: """A tf_decorator-aware wrapper for `contextlib.contextmanager`. Usage is identical to `contextlib.contextmanager`. diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py index 781dcb2ae89ee6..a716f354ad415f 100644 --- a/tensorflow/python/util/tf_inspect.py +++ b/tensorflow/python/util/tf_inspect.py @@ -17,8 +17,6 @@ import functools import inspect as _inspect -import six - from tensorflow.python.util import tf_decorator @@ -235,7 +233,7 @@ def _get_argspec_for_partial(obj): all_defaults[-len(defaults):] = defaults # Fill in default values provided by partial function in all_defaults. - for kw, default in six.iteritems(partial_keywords): + for kw, default in iter(partial_keywords.items()): if kw in args: idx = args.index(kw) all_defaults[idx] = default diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index a537864036534c..b42e75b2ca2365 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -32,6 +32,26 @@ namespace tensorflow { namespace swig { namespace { +constexpr const char ITERATOR_OPS_MODULE[] = + "tensorflow.python.data.ops.iterator_ops"; +constexpr const char COMPOSITE_TENSOR_MODULE[] = + "tensorflow.python.framework.composite_tensor"; +constexpr const char INDEXED_SLICES_MODULE[] = + "tensorflow.python.framework.indexed_slices"; +constexpr const char OPS_MODULE[] = + "tensorflow.python.framework.ops"; +constexpr const char SPARSE_TENSOR_MODULE[] = + "tensorflow.python.framework.sparse_tensor"; +constexpr const char TENSOR_MODULE[] = + "tensorflow.python.framework.tensor"; +constexpr const char TYPE_SPEC_MODULE[] = + "tensorflow.python.framework.type_spec"; +constexpr const char RESOURCE_VAR_MODULE[] = + "tensorflow.python.ops.resource_variable_ops"; +constexpr const char VARIABLES_MODULE[] = + "tensorflow.python.ops.variables"; +constexpr const char CORE_TYPES_MODULE[] = + "tensorflow.python.types.core"; string PyObjectToString(PyObject* o); } // namespace @@ -53,17 +73,6 @@ PyObject* GetRegisteredPyObject(const string& name) { return it->second; } -PyObject* RegisterType(PyObject* type_name, PyObject* type) { - if (!PyType_Check(type)) { - PyErr_SetString(PyExc_TypeError, - tensorflow::strings::StrCat("Expecting a type, got ", - Py_TYPE(type)->tp_name) - .c_str()); - return nullptr; - } - return RegisterPyObject(type_name, type); -} - PyObject* RegisterPyObject(PyObject* name, PyObject* value) { string key; if (PyBytes_Check(name)) { @@ -212,22 +221,31 @@ class CachedTypeCheck { TF_GUARDED_BY(type_to_sequence_map_mu_); }; -// Returns 1 if 'obj' is an instance of 'type_name' -// Returns 0 otherwise. -// Returns -1 if an error occurred (e.g., if 'type_name' is not registered.) 
-int IsInstanceOfRegisteredType(PyObject* obj, const char* type_name) { - PyObject* type_obj = GetRegisteredPyObject(type_name); - if (TF_PREDICT_FALSE(type_obj == nullptr)) { - PyErr_SetString(PyExc_RuntimeError, - tensorflow::strings::StrCat( - type_name, - " type has not been set. " - "Please register the type with the identifier \"", - type_name, "\" using RegisterType.") - .c_str()); - return -1; +PyObject* ImportTypeFromModule(const char* module_name, const char* type_name) { + static PyObject* given_type; + given_type = [module_name, type_name]() { + PyObject* module = PyImport_ImportModule(module_name); + PyObject* attr = + module ? PyObject_GetAttrString(module, type_name) : nullptr; + if (attr == nullptr) { + PyErr_WriteUnraisable(nullptr); + PyErr_Clear(); + } + if (module) Py_DECREF(module); + return attr; + }(); + return given_type; +} + +// Returns true if 'obj' is an instance of 'type_name' +// Returns false otherwise. +int IsInstanceOfGivenType(PyObject* obj, const char* module_name, + const char* type_name) { + PyObject* given_type = ImportTypeFromModule(module_name, type_name); + if (TF_PREDICT_FALSE(given_type == nullptr)) { + return false; } - return PyObject_IsInstance(obj, type_obj); + return PyObject_IsInstance(obj, given_type); } // Returns 1 if `o` is considered a mapping for the purposes of Flatten(). @@ -235,7 +253,7 @@ int IsInstanceOfRegisteredType(PyObject* obj, const char* type_name) { // Returns -1 if an error occurred. int IsMappingHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "Mapping"); + return IsInstanceOfGivenType(to_check, "collections.abc", "Mapping"); }); if (PyDict_Check(o)) return true; return check_cache->CachedLookup(o); @@ -245,7 +263,7 @@ int IsMappingHelper(PyObject* o) { // Flatten(). Returns 0 otherwise. Returns -1 if an error occurred. int IsMutableMappingHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "MutableMapping"); + return IsInstanceOfGivenType(to_check, "collections.abc", "MutableMapping"); }); if (PyDict_Check(o)) return true; return check_cache->CachedLookup(o); @@ -256,7 +274,7 @@ int IsMutableMappingHelper(PyObject* o) { // Returns -1 if an error occurred. int IsMappingViewHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "MappingView"); + return IsInstanceOfGivenType(to_check, "collections.abc", "MappingView"); }); return check_cache->CachedLookup(o); } @@ -266,7 +284,7 @@ int IsMappingViewHelper(PyObject* o) { // Returns -1 if an error occurred. int IsObjectProxy(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "ObjectProxy"); + return IsInstanceOfGivenType(to_check, "wrapt", "ObjectProxy"); }); return check_cache->CachedLookup(o); } @@ -309,7 +327,8 @@ int IsCustomNestProtocolDefined(PyObject* o) { // Returns -1 if an error occurred. int IsIndexedSlicesHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "IndexedSlices"); + return IsInstanceOfGivenType(to_check, INDEXED_SLICES_MODULE, + "IndexedSlices"); }); return check_cache->CachedLookup(o); } @@ -319,7 +338,7 @@ int IsIndexedSlicesHelper(PyObject* o) { // Returns -1 if an error occurred. 
int IsTensorHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "Tensor"); + return IsInstanceOfGivenType(to_check, TENSOR_MODULE, "Tensor"); }); return check_cache->CachedLookup(o); } @@ -329,7 +348,7 @@ int IsTensorHelper(PyObject* o) { // Returns -1 if an error occurred. int IsTensorSpecHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "TensorSpec"); + return IsInstanceOfGivenType(to_check, TENSOR_MODULE, "TensorSpec"); }); return check_cache->CachedLookup(o); } @@ -339,21 +358,21 @@ int IsTensorSpecHelper(PyObject* o) { // Returns -1 if an error occurred. int IsEagerTensorHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "EagerTensor"); + return IsInstanceOfGivenType(to_check, OPS_MODULE, "EagerTensor"); }); return check_cache->CachedLookup(o); } int IsTensorProtocolHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "TensorProtocol"); + return IsInstanceOfGivenType(to_check, CORE_TYPES_MODULE, "TensorProtocol"); }); return check_cache->CachedLookup(o); } int IsCoreTypeValueHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "CoreTypeValue"); + return IsInstanceOfGivenType(to_check, CORE_TYPES_MODULE, "Value"); }); return check_cache->CachedLookup(o); } @@ -363,7 +382,8 @@ int IsCoreTypeValueHelper(PyObject* o) { // Returns -1 if an error occurred. int IsResourceVariableHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "ResourceVariable"); + return IsInstanceOfGivenType(to_check, RESOURCE_VAR_MODULE, + "ResourceVariable"); }); return check_cache->CachedLookup(o); } @@ -373,7 +393,8 @@ int IsResourceVariableHelper(PyObject* o) { // Returns -1 if an error occurred. int IsOwnedIteratorHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "OwnedIterator"); + return IsInstanceOfGivenType(to_check, ITERATOR_OPS_MODULE, + "OwnedIterator"); }); return check_cache->CachedLookup(o); } @@ -383,7 +404,7 @@ int IsOwnedIteratorHelper(PyObject* o) { // Returns -1 if an error occurred. int IsVariableHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "Variable"); + return IsInstanceOfGivenType(to_check, VARIABLES_MODULE, "Variable"); }); return check_cache->CachedLookup(o); } @@ -399,7 +420,8 @@ int IsNestedHelper(PyObject* o) { if (IsCustomNestProtocolDefined(o)) return true; static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - int is_instance = IsInstanceOfRegisteredType(to_check, "Sequence"); + int is_instance = + IsInstanceOfGivenType(to_check, "collections.abc", "Sequence"); // Don't cache a failed is_instance check. 
if (is_instance == -1) return -1; @@ -617,11 +639,10 @@ class CustomNestedIterator : public ValueIterator { bool IsSparseTensorValueType(PyObject* o) { PyObject* sparse_tensor_value_type = - GetRegisteredPyObject("SparseTensorValue"); + ImportTypeFromModule(SPARSE_TENSOR_MODULE, "SparseTensorValue"); if (TF_PREDICT_FALSE(sparse_tensor_value_type == nullptr)) { return false; } - return PyObject_TypeCheck( o, reinterpret_cast(sparse_tensor_value_type)) == 1; } @@ -632,7 +653,8 @@ bool IsSparseTensorValueType(PyObject* o) { bool IsCompositeTensorHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { // TODO(b/246438937): Remove the ResourceVariable test. - return IsInstanceOfRegisteredType(to_check, "CompositeTensor") && + return IsInstanceOfGivenType(to_check, COMPOSITE_TENSOR_MODULE, + "CompositeTensor") && !IsResourceVariable(to_check); }); return check_cache->CachedLookup(o); @@ -644,10 +666,12 @@ bool IsCompositeTensorHelper(PyObject* o) { // Returns -1 if an error occurred. bool IsTypeSpecHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - int is_type_spec = IsInstanceOfRegisteredType(to_check, "TypeSpec"); + int is_type_spec = + IsInstanceOfGivenType(to_check, TYPE_SPEC_MODULE, "TypeSpec"); // TODO(b/246438937): Remove the VariableSpec special case. - int is_dense_spec = (IsInstanceOfRegisteredType(to_check, "TensorSpec") || - IsInstanceOfRegisteredType(to_check, "VariableSpec")); + int is_dense_spec = + (IsInstanceOfGivenType(to_check, TENSOR_MODULE, "TensorSpec") || + IsInstanceOfGivenType(to_check, RESOURCE_VAR_MODULE, "VariableSpec")); if ((is_type_spec == -1) || (is_dense_spec == -1)) return -1; return static_cast(is_type_spec && !is_dense_spec); }); @@ -1128,7 +1152,8 @@ PyObject* IsNamedtuple(PyObject* o, bool strict) { } Safe_PyObjectPtr fields = make_safe(PyObject_GetAttrString(o, "_fields")); - int is_instance = IsInstanceOfRegisteredType(fields.get(), "Sequence"); + int is_instance = + IsInstanceOfGivenType(fields.get(), "collections.abc", "Sequence"); if (is_instance == 0) { Py_RETURN_FALSE; } else if (is_instance == -1) { diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h index fd58430cf8233d..903ddb0f4d1ea1 100644 --- a/tensorflow/python/util/util.h +++ b/tensorflow/python/util/util.h @@ -244,9 +244,6 @@ PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2, // the documentation for `RegisteredPyObjects`. Returns PyNone. PyObject* RegisterPyObject(PyObject* name, PyObject* value); -// Variant of RegisterPyObject that requires the object's value to be a type. -PyObject* RegisterType(PyObject* type_name, PyObject* type); - // Returns a borrowed reference to an object that was registered with // RegisterPyObject. (Do not call Py_DECREF on the result). 
PyObject* GetRegisteredPyObject(const std::string& name); diff --git a/tensorflow/python/util/util_wrapper.cc b/tensorflow/python/util/util_wrapper.cc index 48aa34e72a04a4..5e48eb594d39a1 100644 --- a/tensorflow/python/util/util_wrapper.cc +++ b/tensorflow/python/util/util_wrapper.cc @@ -26,11 +26,6 @@ PYBIND11_MODULE(_pywrap_utils, m) { _pywrap_utils ----- )pbdoc"; - m.def("RegisterType", - [](const py::handle& type_name, const py::handle& type) { - return tensorflow::PyoOrThrow( - tensorflow::swig::RegisterType(type_name.ptr(), type.ptr())); - }); m.def("RegisterPyObject", [](const py::handle& name, const py::handle& type) { return tensorflow::PyoOrThrow( tensorflow::swig::RegisterPyObject(name.ptr(), type.ptr())); diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt index a4fe30b11d1676..dd704da3a62d11 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt @@ -106,6 +106,30 @@ tf_proto { label: LABEL_OPTIONAL type: TYPE_BOOL } + field { + name: "enable_multi_host" + number: 27 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } + field { + name: "backend_server_port" + number: 28 + label: LABEL_OPTIONAL + type: TYPE_INT32 + } + field { + name: "target_tpu" + number: 29 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } + field { + name: "target_gpu" + number: 30 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } field { name: "disable_functional_ops_lowering" number: 21 diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt index 2f53abbe2b3953..c3f36236a34c8b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt @@ -235,6 +235,30 @@ tf_proto { label: LABEL_OPTIONAL type: TYPE_BOOL } + field { + name: "enable_multi_host" + number: 27 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } + field { + name: "backend_server_port" + number: 28 + label: LABEL_OPTIONAL + type: TYPE_INT32 + } + field { + name: "target_tpu" + number: 29 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } + field { + name: "target_gpu" + number: 30 + label: LABEL_OPTIONAL + type: TYPE_BOOL + } field { name: "disable_functional_ops_lowering" number: 21 diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt index d517b4a6219751..e8b27d1124aff1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt @@ -186,7 +186,7 @@ tf_module { } member_method { name: "matmul" - argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'None\'], " + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'grad_a\', \'grad_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "matrix_rank" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index fbebe3b89e42f4..5987b21598a535 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1670,7 +1670,7 @@ tf_module { } member_method { name: "matmul" - argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'None\'], " + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'grad_a\', \'grad_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "matrix_band_part" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index f78ba2e0839c78..80e84f38715742 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -398,15 +398,15 @@ tf_module { } member_method { name: "BatchMatMul" - argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'grad_x\', \'grad_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "BatchMatMulV2" - argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'grad_x\', \'grad_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "BatchMatMulV3" - argspec: "args=[\'x\', \'y\', \'Tout\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'x\', \'y\', \'Tout\', \'adj_x\', \'adj_y\', \'grad_x\', \'grad_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "BatchMatrixBandPart" @@ -1936,6 +1936,10 @@ tf_module { name: "GetSessionTensor" argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "GlobalIterId" + argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "Greater" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -2300,6 +2304,10 @@ tf_module { name: "ListDiff" argspec: "args=[\'x\', \'y\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "ListSnapshotChunksDataset" + argspec: "args=[\'snapshot_path\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "LoadAndRemapMatrix" argspec: "args=[\'ckpt_path\', \'old_tensor_name\', \'row_remapping\', \'col_remapping\', \'initializing_values\', \'num_rows\', \'num_cols\', \'max_rows_in_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], " @@ -2494,7 +2502,7 @@ tf_module { } member_method { name: "MatMul" - argspec: 
"args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'grad_a\', \'grad_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "MatchingFiles" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-options.pbtxt index e649623069d76e..63505344a89afc 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-options.pbtxt @@ -14,12 +14,16 @@ tf_class { name: "experimental_io_device" mtype: "" } + member { + name: "experimental_sharding_callback" + mtype: "" + } member { name: "experimental_write_callbacks" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'experimental_io_device\', \'experimental_enable_async_checkpoint\', \'experimental_write_callbacks\', \'enable_async\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'experimental_io_device\', \'experimental_enable_async_checkpoint\', \'experimental_write_callbacks\', \'enable_async\', \'experimental_sharding_callback\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-max-shard-size-policy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-max-shard-size-policy.pbtxt new file mode 100644 index 00000000000000..eeb8a04569157a --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-max-shard-size-policy.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.train.experimental.MaxShardSizePolicy" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "description" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_shard_size\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-shard-by-task-policy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-shard-by-task-policy.pbtxt new file mode 100644 index 00000000000000..19c91cb1bc42f3 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-shard-by-task-policy.pbtxt @@ -0,0 +1,13 @@ +path: "tensorflow.train.experimental.ShardByTaskPolicy" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "description" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-shardable-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-shardable-tensor.pbtxt new file mode 100644 index 00000000000000..6848e8565c4866 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-shardable-tensor.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.train.experimental.ShardableTensor" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'_tensor_save_spec\', \'tensor\', \'dtype\', \'device\', \'name\', \'shape\', \'slice_spec\', \'checkpoint_key\', \'trackable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-sharding-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-sharding-callback.pbtxt new file mode 100644 index 00000000000000..583a7f7c3135e9 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-sharding-callback.pbtxt @@ -0,0 +1,12 @@ +path: "tensorflow.train.experimental.ShardingCallback" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "description" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt index fc07c4283256e8..c22cacc50d16e0 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "LossScale" mtype: "" } + member { + name: "MaxShardSizePolicy" + mtype: "" + } member { name: "MixedPrecisionLossScaleOptimizer" mtype: "" @@ -20,6 +24,18 @@ tf_module { name: "PythonState" mtype: "" } + member { + name: "ShardByTaskPolicy" + mtype: "" + } + member { + name: "ShardableTensor" + mtype: "" + } + member { + name: "ShardingCallback" + mtype: "" + } member_method { name: "disable_mixed_precision_graph_rewrite" argspec: "args=[], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.dtensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.dtensor.pbtxt index 1fbfb172de4394..598411258b41a2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.dtensor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.dtensor.pbtxt @@ -90,15 +90,15 @@ tf_module { } member_method { name: "initialize_accelerator_system" - argspec: "args=[\'device_type\', \'enable_coordination_service\', \'num_logical_cpu_devices\', \'experimental_reset_context\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'False\'], " + argspec: "args=[\'device_type\', \'enable_coordination_service\', \'num_logical_cpu_devices\', \'experimental_reset_context\', \'experimental_enable_megcore\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'False\', \'False\'], " } member_method { name: "initialize_multi_client" - argspec: "args=[\'device_type\', \'enable_coordination_service\', \'num_logical_cpu_devices\', \'experimental_reset_context\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'False\'], " + argspec: "args=[\'device_type\', \'enable_coordination_service\', \'num_logical_cpu_devices\', \'experimental_reset_context\', \'experimental_enable_megcore\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'False\', \'False\'], " } member_method { name: "initialize_tpu_system" - argspec: "args=[\'device_type\', \'enable_coordination_service\', \'num_logical_cpu_devices\', \'experimental_reset_context\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'False\'], " + argspec: "args=[\'device_type\', \'enable_coordination_service\', \'num_logical_cpu_devices\', \'experimental_reset_context\', \'experimental_enable_megcore\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'False\', \'False\'], " } member_method { name: "is_dtensor" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt index 
2319f6abb046b6..b1861f63d55b8d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt @@ -198,7 +198,7 @@ tf_module { } member_method { name: "matmul" - argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'None\'], " + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'grad_a\', \'grad_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "matrix_rank" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt index 675bb89d694de6..15cdd2e274e29b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt @@ -24,7 +24,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'adaptive_epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " } member_method { name: "add_variable" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.experimental.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.experimental.-adam.pbtxt index d31bab3e3d8c7d..fb2ea437049b45 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.experimental.-adam.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.experimental.-adam.pbtxt @@ -24,7 +24,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', \'0.99\', \'None\', \'True\', \'Adam\'], " + argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'adaptive_epsilon\', \'amsgrad\', \'weight_decay\', \'clipnorm\', \'clipvalue\', \'global_clipnorm\', \'use_ema\', \'ema_momentum\', \'ema_overwrite_frequency\', \'jit_compile\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'False\', \'None\', \'None\', \'None\', \'None\', \'False\', 
\'0.99\', \'None\', \'True\', \'Adam\'], " } member_method { name: "add_variable" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index c514ae513bd6e3..60f091cdb9c303 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -830,7 +830,7 @@ tf_module { } member_method { name: "matmul" - argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'None\'], " + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'output_type\', \'grad_a\', \'grad_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\', \'False\', \'False\', \'None\'], " } member_method { name: "matrix_square_root" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index f78ba2e0839c78..80e84f38715742 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -398,15 +398,15 @@ tf_module { } member_method { name: "BatchMatMul" - argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'grad_x\', \'grad_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "BatchMatMulV2" - argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\', \'grad_x\', \'grad_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "BatchMatMulV3" - argspec: "args=[\'x\', \'y\', \'Tout\', \'adj_x\', \'adj_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'x\', \'y\', \'Tout\', \'adj_x\', \'adj_y\', \'grad_x\', \'grad_y\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "BatchMatrixBandPart" @@ -1936,6 +1936,10 @@ tf_module { name: "GetSessionTensor" argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "GlobalIterId" + argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "Greater" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -2300,6 +2304,10 @@ tf_module { name: "ListDiff" argspec: "args=[\'x\', \'y\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "ListSnapshotChunksDataset" + argspec: "args=[\'snapshot_path\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "LoadAndRemapMatrix" argspec: "args=[\'ckpt_path\', \'old_tensor_name\', \'row_remapping\', \'col_remapping\', \'initializing_values\', \'num_rows\', \'num_cols\', \'max_rows_in_memory\', \'name\'], 
varargs=None, keywords=None, defaults=[\'-1\', \'None\'], " @@ -2494,7 +2502,7 @@ tf_module { } member_method { name: "MatMul" - argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'grad_a\', \'grad_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " } member_method { name: "MatchingFiles" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt index 2d3ef0b3fbb669..1d36dacaff7eea 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt @@ -62,7 +62,7 @@ tf_module { } member_method { name: "trace_on" - argspec: "args=[\'graph\', \'profiler\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], " + argspec: "args=[\'graph\', \'profiler\', \'profiler_outdir\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], " } member_method { name: "write" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-options.pbtxt index e649623069d76e..63505344a89afc 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-options.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-options.pbtxt @@ -14,12 +14,16 @@ tf_class { name: "experimental_io_device" mtype: "" } + member { + name: "experimental_sharding_callback" + mtype: "" + } member { name: "experimental_write_callbacks" mtype: "" } member_method { name: "__init__" - argspec: "args=[\'self\', \'experimental_io_device\', \'experimental_enable_async_checkpoint\', \'experimental_write_callbacks\', \'enable_async\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'experimental_io_device\', \'experimental_enable_async_checkpoint\', \'experimental_write_callbacks\', \'enable_async\', \'experimental_sharding_callback\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-max-shard-size-policy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-max-shard-size-policy.pbtxt new file mode 100644 index 00000000000000..eeb8a04569157a --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-max-shard-size-policy.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.train.experimental.MaxShardSizePolicy" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "description" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_shard_size\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-shard-by-task-policy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-shard-by-task-policy.pbtxt new file mode 100644 index 00000000000000..19c91cb1bc42f3 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-shard-by-task-policy.pbtxt @@ -0,0 +1,13 @@ +path: "tensorflow.train.experimental.ShardByTaskPolicy" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "description" + mtype: "" + } + member_method { + name: "__init__" + } +} 
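The golden-file changes above surface a new checkpoint-sharding hook: tf.train.CheckpointOptions now accepts an experimental_sharding_callback, and tf.train.experimental gains MaxShardSizePolicy (constructed with max_shard_size) and ShardByTaskPolicy. A minimal sketch of how these pieces are expected to fit together follows; the variable contents, the save path, and the assumption that max_shard_size is a byte count are illustrative and not taken from this patch.

    import tensorflow as tf

    # Illustrative checkpoint contents: a single variable.
    ckpt = tf.train.Checkpoint(v=tf.Variable([1.0, 2.0, 3.0]))

    # Cap each checkpoint shard at roughly 100 MiB
    # (max_shard_size is assumed here to be a byte count).
    policy = tf.train.experimental.MaxShardSizePolicy(max_shard_size=100 * 2**20)

    # Route the policy through the CheckpointOptions field added in this patch.
    options = tf.train.CheckpointOptions(experimental_sharding_callback=policy)
    ckpt.save("/tmp/sharded_ckpt", options=options)

    # ShardByTaskPolicy is the other policy exposed by this change; as its name
    # suggests, it groups saved tensors by the task that owns them.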
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-shardable-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-shardable-tensor.pbtxt new file mode 100644 index 00000000000000..6848e8565c4866 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-shardable-tensor.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.train.experimental.ShardableTensor" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'_tensor_save_spec\', \'tensor\', \'dtype\', \'device\', \'name\', \'shape\', \'slice_spec\', \'checkpoint_key\', \'trackable\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-sharding-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-sharding-callback.pbtxt new file mode 100644 index 00000000000000..583a7f7c3135e9 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.-sharding-callback.pbtxt @@ -0,0 +1,12 @@ +path: "tensorflow.train.experimental.ShardingCallback" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "description" + mtype: "" + } + member_method { + name: "__init__" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt index 2761b489b965ad..1306e29aa98256 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.experimental.pbtxt @@ -1,7 +1,23 @@ path: "tensorflow.train.experimental" tf_module { + member { + name: "MaxShardSizePolicy" + mtype: "" + } member { name: "PythonState" mtype: "" } + member { + name: "ShardByTaskPolicy" + mtype: "" + } + member { + name: "ShardableTensor" + mtype: "" + } + member { + name: "ShardingCallback" + mtype: "" + } } diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.arm64 b/tensorflow/tools/ci_build/Dockerfile.cpu.arm64 index 5090e739c1daa7..7710379719be1d 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cpu.arm64 +++ b/tensorflow/tools/ci_build/Dockerfile.cpu.arm64 @@ -1,4 +1,4 @@ -FROM linaro/tensorflow-arm64-build:2.15-multipython +FROM linaro/tensorflow-arm64-build:2.16-multipython ARG py_major_minor_version='3.10' diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython new file mode 100644 index 00000000000000..b99e17355729b8 --- /dev/null +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython @@ -0,0 +1,44 @@ +# Dockerfile to build a manylinux 2010 compliant cross-compiler. +# +# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible +# glibc (2.12) and system libstdc++ (4.4). +# +# To push a new version, run: +# $ docker build -f Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython \ +# --tag "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython" . 
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython + +FROM gcr.io/tensorflow-sigs/build@sha256:1aa3486c05856d76810dc725a26fc9262ab75dd888169d101e5612bf0800c970 + +ENV DEBIAN_FRONTEND=noninteractive + +COPY install/install_bootstrap_deb_packages.sh /install/ +RUN /install/install_bootstrap_deb_packages.sh + +COPY install/install_deb_packages.sh /install/ +RUN /install/install_deb_packages.sh + +RUN apt-get update && apt-get install -y \ + libbz2-dev \ + libffi-dev \ + libgdbm-dev \ + libncurses5-dev \ + libnss3-dev \ + libreadline-dev \ + libsqlite3-dev \ + patchelf \ + && \ + rm -rf /var/lib/apt/lists/* + +COPY install/build_and_install_python.sh /install/ +RUN /install/build_and_install_python.sh "3.9.18" +RUN /install/build_and_install_python.sh "3.10.13" +RUN /install/build_and_install_python.sh "3.11.6" +RUN /install/build_and_install_python.sh "3.12.0" + +COPY install/install_pip_packages_by_version.sh /install/ +# https://github.com/numpy/numpy/issues/22623 for `SETUPTOOLS_USE_DISTUTILS`. +RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" "jax" +RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" "jax" +RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.11" "jax" +RUN SETUPTOOLS_USE_DISTUTILS=stdlib /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.12" "jax" diff --git a/tensorflow/tools/ci_build/a100/nightly.sh b/tensorflow/tools/ci_build/a100/nightly.sh index d2ca9a3ae86cdf..6914b0269bd564 100644 --- a/tensorflow/tools/ci_build/a100/nightly.sh +++ b/tensorflow/tools/ci_build/a100/nightly.sh @@ -18,4 +18,4 @@ set -e docker pull tensorflow/tensorflow:devel-gpu docker run --gpus all -w /tensorflow_src -v $PWD:/mnt -e HOST_PERMS="$(id -u):$(id -g)" \ - tensorflow/tensorflow:devel-gpu bash -c "git pull; bazel test --config=cuda -c opt --test_tag_filters=gpu,-no_gpu,-benchmark-test,-no_oss,-oss_excluded,-oss_serial,-v1only,-no_gpu_presubmit,-no_cuda11 -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/mlir/tosa/... -//tensorflow/compiler/xrt/... //tensorflow/compiler/mlir/lite/... -//tensorflow/lite/micro/examples/... -//tensorflow/core/tpu/... -//tensorflow/lite/..." + tensorflow/tensorflow:devel-gpu bash -c "git pull; bazel test --config=cuda -c opt --test_tag_filters=gpu,-no_gpu,-benchmark-test,-no_oss,-oss_excluded,-oss_serial,-v1only,-no_gpu_presubmit,-no_cuda11 -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/mlir/tosa/... //tensorflow/compiler/mlir/lite/... -//tensorflow/lite/micro/examples/... -//tensorflow/core/tpu/... -//tensorflow/lite/..." diff --git a/tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh b/tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh index 631c869c80decf..aca2745bd3e8cf 100755 --- a/tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh +++ b/tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh @@ -17,7 +17,6 @@ set -x DEFAULT_BAZEL_TARGETS="//tensorflow/... \ -//tensorflow/compiler/tf2tensorrt/... \ --//tensorflow/compiler/xrt/... \ -//tensorflow/core/tpu/... \ -//tensorflow/go/... \ -//tensorflow/java/... 
\ diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh index 1b4cc0552274d9..1a3fec1e179f7f 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh @@ -42,6 +42,7 @@ JAX_PACKAGES=( "typing_extensions" "ml_dtypes>=0.3.0" "importlib_metadata>=4.6" + "flatbuffers" ) PACKAGES=( diff --git a/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc b/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc index c388f5322abf07..11e64b54f97100 100644 --- a/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc +++ b/tensorflow/tools/ci_build/osx/arm64/.macos.bazelrc @@ -37,4 +37,4 @@ test --build_tests_only --keep_going test:nonpip_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:nonpip_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:nonpip_filters --test_lang_filters=cc,py -test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... 
-//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test diff --git a/tensorflow/tools/ci_build/release/requirements_mac.txt b/tensorflow/tools/ci_build/release/requirements_mac.txt index 39349a3f3a6aa2..aa08a8c8db45e3 100644 --- a/tensorflow/tools/ci_build/release/requirements_mac.txt +++ b/tensorflow/tools/ci_build/release/requirements_mac.txt @@ -8,5 +8,5 @@ twine ~= 3.6.0 setuptools # Test dependencies which don't exist on Windows -jax ~= 0.3.24 +jax ~= 0.4.1 jaxlib ~= 0.4.1 diff --git a/tensorflow/tools/ci_build/release/requirements_ubuntu.txt b/tensorflow/tools/ci_build/release/requirements_ubuntu.txt index 8d7122076fcd91..db2e1ee8b47fca 100644 --- a/tensorflow/tools/ci_build/release/requirements_ubuntu.txt +++ b/tensorflow/tools/ci_build/release/requirements_ubuntu.txt @@ -5,5 +5,5 @@ PyYAML ~= 6.0 # Test dependencies which don't exist on Windows -jax ~= 0.3.14 +jax ~= 0.4.1 jaxlib ~= 0.4.1; platform.machine != 'aarch64' diff --git a/tensorflow/tools/ci_build/windows/bazel/cpu_win_test.sh b/tensorflow/tools/ci_build/windows/bazel/cpu_win_test.sh new file mode 100644 index 00000000000000..25a2f1d4cb44f7 --- /dev/null +++ b/tensorflow/tools/ci_build/windows/bazel/cpu_win_test.sh @@ -0,0 +1,257 @@ +#!/bin/bash +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This script is a CI script maintained by Intel and is used to launch the nightly CI test +# build on the Windows platform. +# It assumes the standard setup on tensorflow Jenkins Windows machines. +# Update the flags/variables below to make it work on your local system. + +# REQUIREMENTS: +# * All installed in standard locations: +# - JDK8, and JAVA_HOME set. +# - Microsoft Visual Studio 2015 Community Edition +# - Msys2 +# - Python 3.x (with pip, setuptools, venv) +# * Bazel Windows executable copied as "bazel.exe" and included in PATH. + + +# All commands should be visible (-x). 
+set -x + +POSITIONAL_ARGS=() +XBF_ARGS="" +XTF_ARGS="" +while [[ $# -gt 0 ]]; do + case "$1" in + --extra_build_flags) + XBF_ARGS="$2" + shift # past argument + shift # past value + ;; + --extra_test_flags) + XTF_ARGS="$2" + shift # past argument + shift # past value + ;; + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + +# Bazelisk (renamed as bazel) is kept in C:\Tools +export PATH=/c/ProgramData/chocolatey/bin:/c/Tools/bazel:/c/Program\ Files/Git:/c/Program\ \ +Files/Git/cmd:/c/msys64:/c/msys64/usr/bin:/c/Windows/system32:/c/Windows:/c/Windows/System32/Wbem + +# Environment variables to be set by Jenkins before calling this script + +export PYTHON_VERSION=${PYTHON_VERSION:-"310"} +export TF_PYTHON_VERSION=${PYTHON_VERSION:0:1}.${PYTHON_VERSION:1} +# keep the tensorflow git repo clone under here as tensorflow subdir +MYTFWS_ROOT=${WORKSPACE:-"C:/Users/mlp_admin"} +MYTFWS_ROOT=`cygpath -m $MYTFWS_ROOT` +export MYTFWS_ROOT="$MYTFWS_ROOT" +export MYTFWS_NAME="tensorflow" +export MYTFWS="${MYTFWS_ROOT}/${MYTFWS_NAME}" +export MYTFWS_ARTIFACT="${MYTFWS_ROOT}/artifact" + + +# Import General Test Target +source tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh + +# Environment variables specific to the system where this job is running, are to +# be set by a script for the specific system. This needs to be set here by sourcing a file. + +export TMP=${TMP:-"${MYTFWS_ROOT}/tmp"} +export TEMP="$TMP" +export TMPDIR=${TMPDIR:-"${MYTFWS}-build"} # used internally by TF build +export TEST_TARGET=${TEST_TARGET:-"${DEFAULT_BAZEL_TARGETS}"} +export MSYS_LOCATION='C:/msys64' +export GIT_LOCATION='C:/Program Files/Git' +export JAVA_LOCATION='C:/Program Files/Eclipse Adoptium/jdk-11.0.14.101-hotspot' +export VS_LOCATION='C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools' +export NATIVE_PYTHON_LOCATION="C:/Python${PYTHON_VERSION}" +export PORTSERVER_LOCATION='C:/Program Files/python_portpicker/src/portserver.py' + + +echo "*** *** hostname is $(hostname) *** ***" +which bazel +which git +[[ -e "$NATIVE_PYTHON_LOCATION/python.exe" ]] || \ +{ echo "Specified Python path is incorrect: $NATIVE_PYTHON_LOCATION"; exit 1;} +[[ -e "$NATIVE_PYTHON_LOCATION/Scripts/pip.exe" ]] || \ +{ echo "Specified Python path has no pip: $NATIVE_PYTHON_LOCATION"; exit 1;} +[[ -e "$NATIVE_PYTHON_LOCATION/Lib/venv" ]] || \ +{ echo "Specified Python path has no venv: $NATIVE_PYTHON_LOCATION"; exit 1;} + +$NATIVE_PYTHON_LOCATION/python.exe -m pip list + +# =========================== Start of actual script ========================= +# This script sets necessary environment variables and runs TF-Windows build & unit tests +# We also assume a few Software components are also installed in the machine: MS VC++, +# MINGW SYS64, Python 3.x, JAVA, Git, Bazelisk etc. + +# Asuumptions +# 1) TF repo cloned into to %WORKSPACE%\tensorflow (aka %TF_LOCATION%) +# 2) Bazelisk is installed in "C:\Tools\Bazel" +# 3) The following jobs-specific env vars will be exported by the caller +# WORKSPACE (ex. C:\Jenkins\workspace\tensorflow-eigen-test-win) +# PYTHON_VERSION (ex. 38) +# PIP_MODULES (if set will contain any additional pip packages) +# 4) System-specific env variables for the location of different software +# components needed for building. 
+ +# Create Python virtual env +cd ${MYTFWS_ROOT} +export PYTHON_DIRECTORY="${MYTFWS_ROOT}"/venv_py${PYTHON_VERSION} +"${NATIVE_PYTHON_LOCATION}"/python.exe -mvenv --clear "${PYTHON_DIRECTORY}" + +#activate virtual env +source "${PYTHON_DIRECTORY}"/Scripts/activate + +which python +python --version + +# Install pip modules specs from tensorflow/tools/ci_build/release/requirements_common.txt +python -m pip install -r $MYTFWS/tensorflow/tools/ci_build/release/requirements_common.txt + +# set up other Variables required by Bazel. +export PYTHON_BIN_PATH="${PYTHON_DIRECTORY}"/Scripts/python.exe +export PYTHON_LIB_PATH="${PYTHON_DIRECTORY}"/Lib/site-packages +export BAZEL_VS=${VS_LOCATION} +export BAZEL_VC=${VS_LOCATION}/VC +export JAVA_HOME=${JAVA_LOCATION} +export BAZEL_SH="${MSYS_LOCATION}"/usr/bin/bash.exe + +cd ${MYTFWS_ROOT} +mkdir -p "$TMP" +mv summary.log summary.log.bak +mv test_failures.log test_failures.log.bak +mv test_run.log test_run.log.bak +rm -rf ${MYTFWS_ARTIFACT} +mkdir -p ${MYTFWS_ARTIFACT} + +cd $MYTFWS + +# All commands shall pass +set -e + +# Setting up the environment variables Bazel and ./configure needs +source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \ + || { echo "Failed to source common_env.sh" >&2; exit 1; } + +# load bazel_test_lib.sh +source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \ + || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; } + +# Recreate an empty bazelrc file under source root +export TMP_BAZELRC=.tmp.bazelrc +rm -f "${TMP_BAZELRC}" +touch "${TMP_BAZELRC}" + +function cleanup { + # Remove all options in .tmp.bazelrc + echo "" > "${TMP_BAZELRC}" +} +trap cleanup EXIT + +# Enable short object file path to avoid long path issues on Windows. +echo "startup --output_user_root=${TMPDIR}" >> "${TMP_BAZELRC}" + +if ! grep -q "import %workspace%/${TMP_BAZELRC}" .bazelrc; then + echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc +fi + +run_configure_for_cpu_build + +# Unset so the script continues even if commands fail, needed to correctly process the logs +set +e + +# start the port server before testing so that each invocation of +# portpicker will defer to the single instance of portserver +# Define the batch script content +BATCH_SCRIPT_START=" +@echo off +set SCRIPT_PATH="${PORTSERVER_LOCATION}" +echo Starting the server... +start \"PORTSERVER\" \"%PYTHON_BIN_PATH%\" \"%SCRIPT_PATH%\" +echo Server started. +" +# Save the batch script content to a temporary batch file +BATCH_SCRIPT_FILE="temp_script.bat" +echo "$BATCH_SCRIPT_START" > "$BATCH_SCRIPT_FILE" + +# Run the batch script +cmd.exe /C "$BATCH_SCRIPT_FILE" + +# NUMBER_OF_PROCESSORS is predefined on Windows +N_JOBS="${NUMBER_OF_PROCESSORS}" +bazel --windows_enable_symlinks test \ + --action_env=TEMP=${TMP} --action_env=TMP=${TMP} ${XTF_ARGS} \ + --experimental_cc_shared_library --enable_runfiles --nodistinct_host_configuration \ + --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu,-tpu \ + --test_tag_filters=-no_windows,-no_oss,-gpu,-tpu \ + --build_tests_only --config=monolithic \ + --dynamic_mode=off --config=xla --config=opt \ + --build_tests_only -k \ + --test_env=PORTSERVER_ADDRESS=@unittest-portserver \ + --repo_env=TF_PYTHON_VERSION=${TF_PYTHON_VERSION} \ + --test_size_filters=small,medium --jobs="${N_JOBS}" --test_timeout=300,450,1200,3600 \ + --flaky_test_attempts=3 --verbose_failures \ + ${POSITIONAL_ARGS[@]} \ + -- ${TEST_TARGET} \ + > run.log 2>&1 + +build_ret_val=$? # Store the ret value + +BATCH_SCRIPT_STOP=" +echo Killing the server... 
+taskkill /FI \"WindowTitle eq PORTSERVER*\" /F /T +echo Server killed. +" +BATCH_SCRIPT_FILEl="temp_script.bat" +echo "$BATCH_SCRIPT_STOP" > "$BATCH_SCRIPT_FILEl" +cmd.exe /C "$BATCH_SCRIPT_FILEl" + +# Removing the temporary batch script +rm -f "$BATCH_SCRIPT_FILE" +rm -f "$BATCH_SCRIPT_FILEl" + +# process results +cd $MYTFWS_ROOT + +# Check to make sure the log was created +[ ! -f "${MYTFWS}"/run.log ] && exit 1 + +# handle logs for unit test +cd ${MYTFWS_ARTIFACT} +cp "${MYTFWS}"/run.log ./test_run.log + +fgrep "FAILED: Build did NOT complete" test_run.log > summary.log +fgrep "Executed" test_run.log >> summary.log + +[ $build_ret_val -eq 0 ] && exit 0 + +echo "FAILED TESTS:" > test_failures.log +fgrep "FAILED" test_run.log | grep " ms)" | sed -e 's/^.*\] //' -e 's/ .*$//' | sort | \ +uniq >> test_failures.log +echo >> test_failures.log +echo "SKIPPED TESTS:" >> test_failures.log +fgrep "SKIPPED" test_run.log | grep -v "listed below:" | sed -e 's/^.*\] //' | sort | \ +uniq >> test_failures.log + +exit 1 diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py index d56e508f8bf2dc..ed8ffc015fb7df 100644 --- a/tensorflow/tools/docs/generate2.py +++ b/tensorflow/tools/docs/generate2.py @@ -324,7 +324,7 @@ def edit_yaml_file(path): expected_path_contents = { "tf/summary/audio.md": - "tensorboard/plugins/audio/summary_v2.py", + "python/summary/tb_summary.py", "tf/estimator/DNNClassifier.md": "tensorflow_estimator/python/estimator/canned/dnn.py", "tf/nn/sigmoid_cross_entropy_with_logits.md": diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD index 37394c6eb9a010..aaf9b0f5f31bb1 100644 --- a/tensorflow/tools/graph_transforms/BUILD +++ b/tensorflow/tools/graph_transforms/BUILD @@ -218,11 +218,7 @@ filegroup( "transform_graph.h", "transform_utils.h", ], - visibility = [ - "//tensorflow/core:__pkg__", - "//tensorflow/python:__pkg__", - "//tensorflow/python/util:__pkg__", - ], + visibility = ["//tensorflow/python/util:__pkg__"], ) cc_library( diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index 0a712456f4e609..513b271be55508 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -165,6 +165,7 @@ genrule( ], "//conditions:default": [], }) + if_cuda([ + "@cub_archive//:LICENSE.TXT", "@local_config_nccl//:LICENSE", ]) + if_mkl([ "//third_party/mkl_dnn:LICENSE", @@ -207,6 +208,7 @@ genrule( ], "//conditions:default": [], }) + if_cuda([ + "@cub_archive//:LICENSE.TXT", "@local_config_nccl//:LICENSE", ]) + if_mkl([ "//third_party/mkl_dnn:LICENSE", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index b926de8e53952a..8b83ce23ab5ef6 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -237,6 +237,7 @@ filegroup( ], "//conditions:default": [], }) + if_cuda([ + "@cub_archive//:LICENSE.TXT", "@local_config_nccl//:LICENSE", ]) + if_mkl([ "//third_party/mkl_dnn:LICENSE", diff --git a/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt b/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt index c0ecfe99bcefff..9ac7ee9b800fd7 100644 --- a/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt +++ b/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt @@ -315,7 +315,7 @@ record keeping.) * are met: * * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
+ * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in @@ -370,21 +370,21 @@ record keeping.) * This package is an SSL implementation written * by Eric Young (eay@cryptsoft.com). * The implementation was written so as to conform with Netscapes SSL. - * + * * This library is free for commercial and non-commercial use as long as * the following conditions are aheared to. The following conditions * apply to all code found in this distribution, be it the RC4, RSA, * lhash, DES, etc., code; not just the SSL code. The SSL documentation * included with this distribution is covered by the same copyright terms * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * + * * Copyright remains Eric Young's, and as such any Copyright notices in * the code are not to be removed. * If this package is used in a product, Eric Young should be given attribution * as the author of the parts of the library used. * This can be in the form of a textual message at program startup or * in documentation (online or textual) provided with the package. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -399,10 +399,10 @@ record keeping.) * Eric Young (eay@cryptsoft.com)" * The word 'cryptographic' can be left out if the rouines from the library * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from + * 4. If you include any Windows specific code (or a derivative thereof) from * the apps directory (application code) you must include an acknowledgement: * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * + * * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -414,7 +414,7 @@ record keeping.) * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * + * * The licence and distribution terms for any publically available version or * derivative of this code cannot be changed. i.e. this code cannot simply be * copied and put under another distribution licence @@ -557,7 +557,40 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. - */ + */ + +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +== cutlass + +Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. 
+ +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -931,9 +964,9 @@ the other COPYING.* files here. If you want to guarantee that the Eigen code that you are #including is licensed under the MPL2 and possibly more permissive licenses (like -BSD), #define this preprocessor symbol: EIGEN_MPL2_ONLY +BSD), #define this preprocessor symbol: EIGEN_MPL2_ONLY For example, with most compilers, you could add this to your project - CXXFLAGS: -DEIGEN_MPL2_ONLY + CXXFLAGS: -DEIGEN_MPL2_ONLY This will cause a compilation error to be generated if you #include any code that is covered by more restrictive licences than MPL2. @@ -1693,7 +1726,7 @@ Mozilla Public License Version 2.0 means any form of the work other than Source Code Form. 1.7. "Larger Work" - means a work that combines Covered Software with other material, in + means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. "License" @@ -3591,7 +3624,7 @@ Mozilla Public License Version 2.0 means any form of the work other than Source Code Form. 1.7. "Larger Work" - means a work that combines Covered Software with other material, in + means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. "License" @@ -3952,8 +3985,8 @@ Copyright Notice and Statement for the h5py Project documentation and/or other materials provided with the distribution. - c. Neither the name of the author nor the names of contributors may - be used to endorse or promote products derived from this software + c. Neither the name of the author nor the names of contributors may + be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS @@ -5242,7 +5275,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/llvm/lib/Support/COPYRIGHT.regex: $OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $ @@ -5300,7 +5333,7 @@ to the following restrictions: */ ============================================================================== -============================================================================== +============================================================================== License for third_party/llvm/llvm-project/llvm/cmake/config.guess: GNU GENERAL PUBLIC LICENSE @@ -5612,7 +5645,7 @@ exception to the GPL from your modified version. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/llvm/-project/polly/lib/External/isl/LICENSE: MIT License (MIT) @@ -5636,7 +5669,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/llgo/third_party/gotools/LICENSE: Copyright (c) 2009 The Go Authors. All rights reserved. @@ -5668,7 +5701,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/llgo/third_party/gofrontend/libffi/LICENSE: libffi - Copyright (c) 1996-2014 Anthony Green, Red Hat, Inc and others. @@ -5694,7 +5727,7 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/lldb/third_party/Python/module/six/LICENSE: Copyright (c) 2010-2015 Benjamin Peterson @@ -5717,7 +5750,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/lldb/third_party/Python/module/pexpect-4.6/LICENSE and lldb/third_party/Python/module/ptyprocess-0.6.0/LICENSE. @@ -5732,7 +5765,7 @@ ISC LICENSE Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 
- + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR @@ -5742,7 +5775,7 @@ ISC LICENSE OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/clang-tools-extra/clangd/clients/clangd-vscode/LICENSE: @@ -5769,7 +5802,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/llvm/include/llvm/Support/LICENSE.TXT: LLVM System Interface Library @@ -5780,7 +5813,7 @@ License and has the following additional copyright: Copyright (C) 2004 eXtensible Systems, Inc. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/llvm/test/YAMLParser/LICENSE.txt: Copyright (c) 2006 Kirill Simonov @@ -5804,7 +5837,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/clang-tools-extra/clang-tidy/cert/LICENSE.TXT: ------------------------------------------------------------------------------ @@ -5831,7 +5864,7 @@ to reproduce the title of the content being linked to, nor to reproduce any de Minimis description of such content. ============================================================================== -============================================================================== +============================================================================== Copied from llvm-project/clang-tools-extra/clang-tidy/hicpp/LICENSE.TXT: ------------------------------------------------------------------------------ @@ -6108,21 +6141,21 @@ Copied from docker_kokoro/dockerfiles/scripts/google_packages/deb_packages/copyr Files: libcxx/utils/google-benchmark/* License: Apache 2.0 - + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ - + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - + 1. Definitions. - + "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. - + "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. - + "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, @@ -6130,24 +6163,24 @@ License: Apache 2.0 direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. - + "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. - + "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. - + "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. - + "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). - + "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications @@ -6155,7 +6188,7 @@ License: Apache 2.0 of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. - + "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally @@ -6169,18 +6202,18 @@ License: Apache 2.0 Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." - + "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. - + 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. - + 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable @@ -6196,24 +6229,24 @@ License: Apache 2.0 or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. - + 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: - + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and - + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and - + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and - + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained @@ -6230,14 +6263,14 @@ License: Apache 2.0 or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. - + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. - + 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of @@ -6245,12 +6278,12 @@ License: Apache 2.0 Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. - + 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. - + 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, @@ -6260,7 +6293,7 @@ License: Apache 2.0 PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. - + 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly @@ -6272,7 +6305,7 @@ License: Apache 2.0 work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. - + 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, @@ -6283,11 +6316,11 @@ License: Apache 2.0 defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - + END OF TERMS AND CONDITIONS - + APPENDIX: How to apply the Apache License to your work. 
- + To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include @@ -6296,15 +6329,15 @@ License: Apache 2.0 file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. - + Copyright [yyyy] [name of copyright owner] - + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -7334,8 +7367,8 @@ THE POSSIBILITY OF SUCH DAMAGE.** # Components -Many parts of this module have been derived from original sources, -often the algorithm's designer. Component licenses are located with +Many parts of this module have been derived from original sources, +often the algorithm's designer. Component licenses are located with the component code. @@ -9264,33 +9297,33 @@ been taken from other projects or from the open internet. Every line of code can be traced back to its original author, and all of those authors have public domain dedications on file. So the SQLite code base is clean and is uncontaminated with licensed code from other projects. - + -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- == triton -/* +/* * Copyright 2018-2020 Philippe Tillet * Copyright 2020-2022 OpenAI -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: -* -* The above copyright notice and this permission notice shall be +* +* The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh index 8a626aa2f887a0..ff21aadba95be6 100755 --- a/tensorflow/tools/pip_package/build_pip_package.sh +++ b/tensorflow/tools/pip_package/build_pip_package.sh @@ -167,12 +167,12 @@ function prepare_src() { cp -L \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/org_tensorflow/LICENSE \ "${TMPDIR}" - + # Change the format of file path (TMPDIR-->TMPDIR_rsync) which is input to the rsync from - # Windows-compatible to Linux-compatible to resolve the error below - # error: ssh: Could not resolve hostname c: No such host is known. - - TMPDIR_rsync=`cygpath $TMPDIR` + # Windows-compatible to Linux-compatible to resolve the error below + # error: ssh: Could not resolve hostname c: No such host is known. + + TMPDIR_rsync=`cygpath $TMPDIR` rsync -a \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/org_tensorflow/tensorflow \ "${TMPDIR_rsync}" @@ -215,23 +215,6 @@ function prepare_src() { cp -L \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/LICENSE \ "${TMPDIR}" - # Check if it is a tpu build - if [[ ${TPU_BUILD} == "1" ]]; then - # Check if libtpu.so exists - if [[ -f "./tensorflow/lib/libtpu.so" ]]; then - if [[ ! -L "${RUNFILES}/tensorflow/lib/libtpu.so" ]]; then - mkdir "$(real_path ${RUNFILES}/tensorflow/lib)" - ln -s $(real_path ./tensorflow/lib/libtpu.so) $(real_path ${RUNFILES}/tensorflow/lib/libtpu.so) - echo "Created symlink: $(real_path ./tensorflow/lib/libtpu.so) -> \ - $(real_path ${RUNFILES}/tensorflow/lib/libtpu.so)" - else - echo "Symlink already exists: ${RUNFILES}/tensorflow/lib/libtpu.so" - fi - else - echo "Libtpu.so is not found in $(real_path ./tensorflow/lib/)" - exit 1 - fi - fi cp -LR \ bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \ "${TMPDIR}" @@ -263,11 +246,17 @@ function prepare_src() { chmod +rw ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so else chmod +rw ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so + chmod +rw ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so chmod +rw ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so + chmod +rw ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so):\$ORIGIN/../../tensorflow/tsl/python/lib/core ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so + patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so):\$ORIGIN/../../../../../python ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so):\$ORIGIN/../../../../../python 
${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so + patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so):\$ORIGIN/../../../../../python ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so patchelf --shrink-rpath ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so + patchelf --shrink-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so patchelf --shrink-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so + patchelf --shrink-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so fi mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true if [ -n "${mkl_so_dir}" ]; then @@ -358,7 +347,7 @@ function build_wheel() { FULL_DIR="$(real_path "$PY_DIR")/bin/python3" export PYTHONPATH="$PYTHONPATH:$PWD/bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/pypi_wheel/site-packages/" fi - + pushd ${TMPDIR} > /dev/null rm -f MANIFEST diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index a5a20b8dcf78e7..57faa7fb4ae7f2 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -29,6 +29,7 @@ 2.0](https://github.com/tensorflow/tensorflow/blob/master/LICENSE). """ +import datetime import fnmatch import os import platform @@ -178,9 +179,6 @@ def standard_or_nightly(standard, nightly): 'nvidia-cusparse-cu12 == 12.1.2.141', 'nvidia-nccl-cu12 == 2.18.3', 'nvidia-nvjitlink-cu12 == 12.2.140', - 'tensorrt == 8.6.1.post1', - 'tensorrt-bindings == 8.6.1', - 'tensorrt-libs == 8.6.1', ] DOCLINES = __doc__.split('\n') @@ -322,9 +320,23 @@ def find_files(pattern, root): for path in so_lib_paths: matches.extend(['../' + x for x in find_files('*', path) if '.py' not in x]) -# If building a tpu package, bundle libtpu.so as part of the wheel +# If building a tpu package, LibTPU for Cloud TPU VM can be installed via: +# $ pip install -f https://storage.googleapis.com/libtpu-releases/index.html +# libtpu is built and uploaded to this link every night (PST). if '_tpu' in project_name: - matches.append('tensorflow/lib/libtpu.so') + # For tensorflow-tpu releases, use a set libtpu-nightly version; + # For tf-nightly-tpu, use the most recent libtpu-nightly. Because of the + # timing of these tests, the UTC date from eight hours ago is expected to be a + # valid version. 
+ _libtpu_version = standard_or_nightly( + '0.1.dev20231018', + '0.1.dev' + + ( + datetime.datetime.now(tz=datetime.timezone.utc) + - datetime.timedelta(hours=8) + ).strftime('%Y%m%d'), + ) + REQUIRED_PACKAGES.append([f'libtpu-nightly=={_libtpu_version}']) if os.name == 'nt': EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd' @@ -422,6 +434,7 @@ def find_files(pattern, root): 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3 :: Only', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', diff --git a/tensorflow/tools/proto_splitter/BUILD b/tensorflow/tools/proto_splitter/BUILD index 5a74f394eb9dd1..447bae692394e7 100644 --- a/tensorflow/tools/proto_splitter/BUILD +++ b/tensorflow/tools/proto_splitter/BUILD @@ -100,6 +100,7 @@ py_strict_test( ":chunk_proto_py", ":split", ":versions_proto_py", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", "//tensorflow/tools/proto_splitter/testdata:test_message_proto_py", "@riegeli_py//python/riegeli", @@ -136,6 +137,7 @@ py_strict_test( ":constants", ":split_graph_def", ":util", + #internal proto upb dep "//tensorflow/core:protos_all_py", "//tensorflow/python/platform:client_testlib", "//tensorflow/tools/proto_splitter/python:test_util", @@ -153,6 +155,7 @@ py_strict_test( srcs = ["util_test.py"], deps = [ ":util", + #internal proto upb dep "//tensorflow/python/platform:client_testlib", "//tensorflow/tools/proto_splitter/testdata:test_message_proto_py", ], diff --git a/tensorflow/tools/proto_splitter/cc/BUILD b/tensorflow/tools/proto_splitter/cc/BUILD index 266d69479ff8a0..1188ed94533864 100644 --- a/tensorflow/tools/proto_splitter/cc/BUILD +++ b/tensorflow/tools/proto_splitter/cc/BUILD @@ -1,12 +1,13 @@ -# Description: -# Utilities for splitting and joining large protos > 2GB. -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_oss", "tf_cc_test", ) +# Description: +# Utilities for splitting and joining large protos > 2GB. 
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ @@ -43,11 +44,16 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/tools/proto_splitter:chunk_proto_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:cord", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:protobuf", + "@riegeli//riegeli/bytes:cord_writer", "@riegeli//riegeli/bytes:fd_writer", + "@riegeli//riegeli/bytes:string_writer", "@riegeli//riegeli/records:record_writer", ] + if_oss([ "//tensorflow/tools/proto_splitter:protos_impl", @@ -85,11 +91,15 @@ tf_cc_test( "//tensorflow/tools/proto_splitter:chunk_proto_cc", "//tensorflow/tools/proto_splitter/testdata:test_message_proto_cc", "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:cord", "@com_google_googletest//:gtest_main", "@local_tsl//tsl/lib/core:status_test_util", "@local_tsl//tsl/platform:protobuf", "@local_tsl//tsl/platform:status_matchers", + "@riegeli//riegeli/bytes:cord_reader", "@riegeli//riegeli/bytes:fd_reader", + "@riegeli//riegeli/bytes:string_reader", "@riegeli//riegeli/records:record_reader", ] + if_oss([ "//tensorflow/tools/proto_splitter:protos_impl", @@ -197,6 +207,7 @@ cc_library( ":composable_splitter", ":max_size", ":size_splitter", + ":split", ":util", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/status", diff --git a/tensorflow/tools/proto_splitter/cc/composable_splitter_base.cc b/tensorflow/tools/proto_splitter/cc/composable_splitter_base.cc index 8a9ee3091a1366..b02c09c6fa8d62 100644 --- a/tensorflow/tools/proto_splitter/cc/composable_splitter_base.cc +++ b/tensorflow/tools/proto_splitter/cc/composable_splitter_base.cc @@ -1,5 +1,7 @@ #include "tensorflow/tools/proto_splitter/cc/composable_splitter_base.h" +#include + /* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,25 +16,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include #include #include +#include #include #include #include #include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/cord.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "riegeli/bytes/cord_writer.h" // from @riegeli #include "riegeli/bytes/fd_writer.h" // from @riegeli +#include "riegeli/bytes/string_writer.h" // from @riegeli #include "riegeli/records/record_writer.h" // from @riegeli #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/tools/proto_splitter/cc/max_size.h" +#include "tensorflow/tools/proto_splitter/cc/split.h" #include "tensorflow/tools/proto_splitter/cc/util.h" #include "tensorflow/tools/proto_splitter/chunk.pb.h" +#include "tsl/platform/env.h" #include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" + +#define IS_OSS true namespace tensorflow { namespace tools::proto_splitter { @@ -86,27 +102,67 @@ ComposableSplitterBase::Split() { return std::make_pair(&chunks_, &chunked_message_); } -absl::Status ComposableSplitterBase::Write(std::string file_prefix) { +template +static absl::Status WriteToRecordWriter( + riegeli::RecordWriter& writer, const std::vector& chunks, + ChunkedMessage& chunked_message, + const ::proto_splitter::VersionDef& version) { + // Export Riegeli / chunked file. + ChunkMetadata metadata; + *metadata.mutable_message() = chunked_message; + *metadata.mutable_version() = version; + auto* metadata_chunks = metadata.mutable_chunks(); + + for (const auto& chunk : chunks) { + auto* chunk_metadata = metadata_chunks->Add(); + if (std::holds_alternative>( + chunk)) { + const auto& msg_chunk = + std::get>(chunk); + LOG(INFO) << "Writing chunk of size " << msg_chunk->ByteSizeLong(); + writer.WriteRecord(*msg_chunk); + chunk_metadata->set_size(msg_chunk->ByteSizeLong()); + chunk_metadata->set_type(::proto_splitter::ChunkInfo::MESSAGE); + } else if (std::holds_alternative(chunk)) { + auto* msg_chunk = std::get(chunk); + writer.WriteRecord(*msg_chunk); + chunk_metadata->set_size(msg_chunk->ByteSizeLong()); + chunk_metadata->set_type(::proto_splitter::ChunkInfo::MESSAGE); + } else { + const auto& str_chunk = std::get(chunk); + writer.WriteRecord(str_chunk); + chunk_metadata->set_size(str_chunk.size()); + chunk_metadata->set_type(::proto_splitter::ChunkInfo::BYTES); + } + chunk_metadata->set_offset(writer.LastPos().get().numeric()); + } + writer.WriteRecord(metadata); + return absl::OkStatus(); +} + +absl::Status ComposableSplitterBase::CheckIfWriteImplemented() { if (parent_splitter_ != nullptr) { return absl::UnimplementedError( "The `Write` function behavior for children ComposableSplitter has not " - "been defined. Please call the parent ComposableSplitter's `Write` " - "instead."); - } - auto split_status = Split(); - if (!split_status.ok()) { - return split_status.status(); + "been defined. 
Please call `parent_splitter.Write()` instead."); } + return absl::OkStatus(); +} - auto chunks = split_status.value().first; - auto chunked_message = split_status.value().second; +absl::Status ComposableSplitterBase::Write(std::string file_prefix) { + TF_RETURN_IF_ERROR(CheckIfWriteImplemented()); + + auto split_results = Split(); + if (!split_results.ok()) return split_results.status(); + auto& chunks = *split_results.value().first; + auto& chunked_message = *split_results.value().second; tsl::Env* env = tsl::Env::Default(); TF_RETURN_IF_ERROR(env->RecursivelyCreateDir( std::string{tensorflow::io::Dirname(file_prefix)})); std::string output_path; - if (chunked_message->chunked_fields().empty()) { + if (chunked_message.chunked_fields().empty()) { // Export regular pb. output_path = absl::StrCat(file_prefix, ".pb"); TF_RETURN_IF_ERROR( @@ -114,43 +170,77 @@ absl::Status ComposableSplitterBase::Write(std::string file_prefix) { } else { // Export Riegeli / chunked file. output_path = absl::StrCat(file_prefix, ".cpb"); - riegeli::RecordWriter writer((riegeli::FdWriter(output_path))); - - ChunkMetadata metadata; - metadata.mutable_message()->MergeFrom(*chunked_message); - metadata.mutable_version()->MergeFrom(Version()); - auto metadata_chunks = metadata.mutable_chunks(); - - for (auto chunk : *chunks) { - auto chunk_metadata = metadata_chunks->Add(); - if (std::holds_alternative>( - chunk)) { - auto msg_chunk = - std::get>(chunk); - writer.WriteRecord(*msg_chunk); - chunk_metadata->set_size(msg_chunk->ByteSizeLong()); - chunk_metadata->set_type(::proto_splitter::ChunkInfo::MESSAGE); - } else if (std::holds_alternative(chunk)) { - auto msg_chunk = std::get(chunk); - writer.WriteRecord(*msg_chunk); - chunk_metadata->set_size(msg_chunk->ByteSizeLong()); - chunk_metadata->set_type(::proto_splitter::ChunkInfo::MESSAGE); - } else { - auto str_chunk = std::get(chunk); - writer.WriteRecord(str_chunk); - chunk_metadata->set_size(str_chunk.size()); - chunk_metadata->set_type(::proto_splitter::ChunkInfo::BYTES); - } - chunk_metadata->set_offset(writer.LastPos().get().numeric()); - } - - writer.WriteRecord(metadata); + using WriterType = riegeli::FdWriter<>; + riegeli::RecordWriter writer((WriterType(output_path))); + if (!writer.is_open()) return writer.status(); + TF_RETURN_IF_ERROR(WriteToRecordWriter( + writer, chunks, chunked_message, Version())); if (!writer.Close()) return writer.status(); } LOG(INFO) << "Splitter output written to " << output_path; return absl::OkStatus(); } +absl::StatusOr> +ComposableSplitterBase::WriteToString() { + TF_RETURN_IF_ERROR(CheckIfWriteImplemented()); + + auto split_results = Split(); + if (!split_results.ok()) return split_results.status(); + auto& chunks = *split_results.value().first; + auto& chunked_message = *split_results.value().second; + + std::string output; + if (chunked_message.chunked_fields().empty()) { + // Export regular pb. + if (!message_->SerializeToString(&output)) + return absl::InvalidArgumentError("Serialization to string failed"); + LOG(INFO) << "Splitter output written to string"; + return std::make_tuple(output, false); + } else { + // Export Riegeli / chunked file. 
+ using WriterType = riegeli::StringWriter<>; + riegeli::RecordWriter writer((WriterType(&output))); + if (!writer.is_open()) return writer.status(); + TF_RETURN_IF_ERROR(WriteToRecordWriter( + writer, chunks, chunked_message, Version())); + if (!writer.Close()) return writer.status(); + LOG(INFO) << "Splitter output written to string"; + return std::make_tuple(output, true); + } +} + +#if !IS_OSS +absl::StatusOr> +ComposableSplitterBase::WriteToCord() { + TF_RETURN_IF_ERROR(CheckIfWriteImplemented()); + + auto split_results = Split(); + if (!split_results.ok()) return split_results.status(); + auto& chunks = *split_results.value().first; + auto& chunked_message = *split_results.value().second; + + absl::Cord output; + if (chunked_message.chunked_fields().empty()) { + // Export regular pb. + if (!message_->SerializeToCord(&output)) + return absl::InvalidArgumentError("Serialization to absl::Cord failed"); + LOG(INFO) << "Splitter output written to absl::Cord"; + return std::make_tuple(output, false); + } else { + // Export Riegeli / chunked file. + using WriterType = riegeli::CordWriter<>; + riegeli::RecordWriter writer((WriterType(&output))); + if (!writer.is_open()) return writer.status(); + TF_RETURN_IF_ERROR(WriteToRecordWriter( + writer, chunks, chunked_message, Version())); + if (!writer.Close()) return writer.status(); + LOG(INFO) << "Splitter output written to absl::Cord"; + return std::make_tuple(output, true); + } +} +#endif + absl::Status ComposableSplitterBase::SetMessageAsBaseChunk() { if (!chunks_.empty()) { return absl::FailedPreconditionError( diff --git a/tensorflow/tools/proto_splitter/cc/composable_splitter_base.h b/tensorflow/tools/proto_splitter/cc/composable_splitter_base.h index 478638b43fb989..a37a3c61ca0a02 100644 --- a/tensorflow/tools/proto_splitter/cc/composable_splitter_base.h +++ b/tensorflow/tools/proto_splitter/cc/composable_splitter_base.h @@ -15,18 +15,23 @@ limitations under the License. #ifndef TENSORFLOW_TOOLS_PROTO_SPLITTER_CC_COMPOSABLE_SPLITTER_BASE_H_ #define TENSORFLOW_TOOLS_PROTO_SPLITTER_CC_COMPOSABLE_SPLITTER_BASE_H_ +#include #include #include +#include #include #include #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/cord.h" #include "tensorflow/tools/proto_splitter/cc/split.h" #include "tensorflow/tools/proto_splitter/cc/util.h" #include "tensorflow/tools/proto_splitter/chunk.pb.h" #include "tsl/platform/protobuf.h" +#define IS_OSS true + namespace tensorflow { namespace tools::proto_splitter { @@ -62,6 +67,12 @@ class ComposableSplitterBase : public Splitter { // attach a `.pb` or `.cpb` (chunked pb) suffix depending on whether the // proto is split. absl::Status Write(std::string file_prefix) override; + // The bool field record whether it's saved as a chunked protobuf (true) or + // regular protobuf (false). + absl::StatusOr> WriteToString(); +#if !IS_OSS + absl::StatusOr> WriteToCord(); +#endif VersionDef Version() override; @@ -93,6 +104,7 @@ class ComposableSplitterBase : public Splitter { // the chunks were always added to the end of the list. However, this is not // always the case the indices must be updated. 
absl::Status FixChunks(); + absl::Status CheckIfWriteImplemented(); bool built_; tsl::protobuf::Message* message_; diff --git a/tensorflow/tools/proto_splitter/cc/composable_splitter_test.cc b/tensorflow/tools/proto_splitter/cc/composable_splitter_test.cc index 85eeab4f5a2dad..8efdf36caee628 100644 --- a/tensorflow/tools/proto_splitter/cc/composable_splitter_test.cc +++ b/tensorflow/tools/proto_splitter/cc/composable_splitter_test.cc @@ -16,12 +16,18 @@ limitations under the License. #include #include +#include +#include +#include #include #include #include #include "absl/status/status.h" +#include "absl/strings/cord.h" +#include "riegeli/bytes/cord_reader.h" // from @riegeli #include "riegeli/bytes/fd_reader.h" // from @riegeli +#include "riegeli/bytes/string_reader.h" // from @riegeli #include "riegeli/records/record_reader.h" // from @riegeli #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/platform/env.h" @@ -33,10 +39,11 @@ limitations under the License. #include "tensorflow/tools/proto_splitter/testdata/test_message.pb.h" #include "tsl/lib/core/status_test_util.h" #include "tsl/platform/errors.h" -#include "tsl/platform/protobuf.h" #include "tsl/platform/status_matchers.h" #include "tsl/platform/statusor.h" +#define IS_OSS true + namespace tensorflow { namespace tools::proto_splitter { namespace { @@ -120,23 +127,9 @@ TEST(RepeatedStringSplitterTest, TestSplitChunks) { EXPECT_EQ(chunked_message2, chunked_message); } -TEST(RepeatedStringSplitterTest, TestWrite) { - std::vector strings = {"piece-1", "piece-2", "piece-3"}; - auto message = SetUpRepeatedString(strings); - RepeatedStringSplitter splitter = RepeatedStringSplitter(&message); - - std::string output_prefix = tensorflow::io::GetTempFilename(""); - TF_ASSERT_OK(splitter.Write(output_prefix)); - std::string expected_file = absl::StrCat(output_prefix, ".cpb"); - - TF_ASSERT_OK_AND_ASSIGN(auto exists, - internal::FileExists(Env::Default(), expected_file)); - EXPECT_TRUE(exists); - - // Look for the last chunk, which should contain a ChunkMetadata proto. - riegeli::RecordReader> reader( - (riegeli::FdReader(expected_file))); - +template +static void CheckChunks(riegeli::RecordReader& reader, + std::vector& strings) { ChunkMetadata chunk_metadata; reader.Seek(reader.Size().value()); reader.SeekBack(); @@ -169,6 +162,60 @@ TEST(RepeatedStringSplitterTest, TestWrite) { })pb")); } +TEST(RepeatedStringSplitterTest, TestWrite) { + std::vector strings = {"piece-1", "piece-2", "piece-3"}; + auto message = SetUpRepeatedString(strings); + RepeatedStringSplitter splitter = RepeatedStringSplitter(&message); + + std::string output_prefix = tensorflow::io::GetTempFilename(""); + TF_ASSERT_OK(splitter.Write(output_prefix)); + std::string expected_file = absl::StrCat(output_prefix, ".cpb"); + + TF_ASSERT_OK_AND_ASSIGN(auto exists, + internal::FileExists(Env::Default(), expected_file)); + EXPECT_TRUE(exists); + + // Look for the last chunk, which should contain a ChunkMetadata proto. 
+ riegeli::RecordReader> file_reader( + (riegeli::FdReader(expected_file))); + + CheckChunks(file_reader, strings); +} + +TEST(RepeatedStringSplitterTest, TestWriteToString) { + std::vector strings = {"piece-1", "piece-2", "piece-3"}; + auto message = SetUpRepeatedString(strings); + RepeatedStringSplitter splitter = RepeatedStringSplitter(&message); + auto string_output_results = splitter.WriteToString(); + TF_EXPECT_OK(string_output_results.status()); + std::string string_output = std::get<0>(string_output_results.value()); + bool is_chunked = std::get<1>(string_output_results.value()); + EXPECT_TRUE(is_chunked); + // Look for the last chunk, which should contain a ChunkMetadata proto. + riegeli::RecordReader> string_reader( + std::forward_as_tuple(string_output)); + + CheckChunks(string_reader, strings); +} + +#if !IS_OSS +TEST(RepeatedStringSplitterTest, TestWriteToCord) { + std::vector strings = {"piece-1", "piece-2", "piece-3"}; + auto message = SetUpRepeatedString(strings); + RepeatedStringSplitter splitter = RepeatedStringSplitter(&message); + auto cord_output_results = splitter.WriteToCord(); + TF_EXPECT_OK(cord_output_results.status()); + absl::Cord cord_output = std::get<0>(cord_output_results.value()); + bool is_chunked = std::get<1>(cord_output_results.value()); + EXPECT_TRUE(is_chunked); + // Look for the last chunk, which should contain a ChunkMetadata proto. + riegeli::RecordReader> cord_reader( + std::forward_as_tuple(&cord_output)); + + CheckChunks(cord_reader, strings); +} +#endif + TEST(RepeatedStringSplitterTest, TestNoSplit) { RepeatedString message; // No strings RepeatedStringSplitter splitter = RepeatedStringSplitter(&message); diff --git a/tensorflow/tools/proto_splitter/cc/graph_def_splitter_test.cc b/tensorflow/tools/proto_splitter/cc/graph_def_splitter_test.cc index 036c90fde04a94..bbb2587a2d3c39 100644 --- a/tensorflow/tools/proto_splitter/cc/graph_def_splitter_test.cc +++ b/tensorflow/tools/proto_splitter/cc/graph_def_splitter_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/tools/proto_splitter/cc/graph_def_splitter.h" +#include #include #include #include @@ -179,7 +180,12 @@ TEST(GraphDefSplitterTest, TestLotsNodes) { const std::string graph_def_path = io::JoinPath(testing::TensorFlowSrcRoot(), "tools/proto_splitter/testdata", "split-lots-nodes.pb"); - int64_t max_size = 500; + + // split-lots-nodes.pb has 15 nodes that are 95 or 96 bytes each. The max size + // is set to "exactly" the size of 5 nodes, but with the extra encoding bytes, + // only 4 nodes should fit in each chunk. Thus, there should be exactly 4 + // chunks created for all 15 nodes. 
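As an illustrative aside, not part of this change: the comment above can be checked with a small self-contained sketch (plain Python, not TF code) of the same greedy grouping the repeated-field splitter performs later in this diff, using 15 nodes of roughly 96 bytes, the `96 * 5` max size set just below, and about 5 extra bytes per node for field/key encoding (the `kExtraBytes` constant introduced further down):

    def greedy_chunks(node_sizes, max_size, extra_bytes=5):
        # Start a new chunk whenever the next node would push the running
        # total (payload plus per-node overhead) past max_size.
        chunks, current, total = [], [], 0
        for size in node_sizes:
            if total + size > max_size and current:
                chunks.append(current)
                current, total = [], 0
            current.append(size)
            total += size + extra_bytes
        if current:
            chunks.append(current)
        return chunks

    per_chunk = [len(c) for c in greedy_chunks([96] * 15, max_size=96 * 5)]
    print(per_chunk)  # [4, 4, 4, 3] -> 4 chunks in total, matching the expected chunk count above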
+ int64_t max_size = 96 * 5; DebugSetMaxSize(max_size); TF_EXPECT_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), @@ -196,7 +202,9 @@ TEST(GraphDefSplitterTest, TestLotsNodes) { *chunked_message, EqualsProto(R"pb(chunk_index: 0 chunked_fields { message { chunk_index: 1 } } - chunked_fields { message { chunk_index: 2 } })pb")); + chunked_fields { message { chunk_index: 2 } } + chunked_fields { message { chunk_index: 3 } } + chunked_fields { message { chunk_index: 4 } })pb")); auto chunks = x.first; EXPECT_CHUNK_SIZES(chunks, max_size); diff --git a/tensorflow/tools/proto_splitter/cc/repeated_field_splitter.cc b/tensorflow/tools/proto_splitter/cc/repeated_field_splitter.cc index e836556d569974..01601c7e22a1fc 100644 --- a/tensorflow/tools/proto_splitter/cc/repeated_field_splitter.cc +++ b/tensorflow/tools/proto_splitter/cc/repeated_field_splitter.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/tools/proto_splitter/cc/repeated_field_splitter.h" +#include #include +#include #include #include "absl/status/status.h" @@ -23,6 +25,7 @@ limitations under the License. #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/tools/proto_splitter/cc/max_size.h" +#include "tensorflow/tools/proto_splitter/cc/split.h" #include "tensorflow/tools/proto_splitter/cc/util.h" #include "tsl/platform/errors.h" #include "tsl/platform/protobuf.h" @@ -31,6 +34,10 @@ limitations under the License. namespace tensorflow { namespace tools::proto_splitter { +// Additional bytes added to each node to account for the extra info needed to +// encode the field key (realistically 3 but making it 5 for some wiggle room). +constexpr int kExtraBytes = 5; + template absl::StatusOr> RepeatedFieldSplitters::Create( @@ -65,13 +72,8 @@ absl::StatusOr RepeatedFieldSplitters< // List of indices at which to split the repeated field. For example, [3, 5] // means that the field list is split into: [:3], [3:5], [5:] - std::vector repeated_msg_split = {}; - // Should be the same length as the list above. Contains new protos to hold - // the elements that are split from the original proto. - // From the [3, 5] example above, the messages in this list contain nodes - // [3:5] and [5:] - std::vector> repeated_new_msg; - // Track the total size of the current node split. + std::vector repeated_msg_split = {0}; + // Track the total byte size of the current node split. uint64_t total_size = 0; // Linearly iterate through all nodes. 
It may be possible to optimize this @@ -99,17 +101,12 @@ absl::StatusOr RepeatedFieldSplitters< } if (total_size + node_size > max_size) { repeated_msg_split.push_back(i); - auto new_chunk = std::make_shared(); - repeated_new_msg.push_back(new_chunk); - std::vector empty_fields = {}; - auto x = std::make_unique(new_chunk); - TF_RETURN_IF_ERROR(AddChunk(std::move(x), &empty_fields)); total_size = 0; } - total_size += node_size; + total_size += node_size + kExtraBytes; } - if (!repeated_msg_split.empty()) { + if (repeated_msg_split.size() > 1) { auto repeated_nodes_ptrs = ret.parent->GetReflection() ->template MutableRepeatedPtrField(ret.parent, @@ -127,7 +124,11 @@ absl::StatusOr RepeatedFieldSplitters< for (int i = 1; i < repeated_msg_split.size(); ++i) { start = repeated_msg_split[i - 1]; int end = repeated_msg_split[i]; - std::shared_ptr new_msg = repeated_new_msg[i - 1]; + + auto new_msg = std::make_shared(); + std::vector empty_fields; + auto x = std::make_unique(new_msg); + TF_RETURN_IF_ERROR(AddChunk(std::move(x), &empty_fields)); // Move nodes into new_msg. TF_ASSIGN_OR_RETURN(auto new_ret, diff --git a/tensorflow/tools/proto_splitter/python/BUILD b/tensorflow/tools/proto_splitter/python/BUILD index 18cd31b66bb8d8..b18bd64b0b8cd8 100644 --- a/tensorflow/tools/proto_splitter/python/BUILD +++ b/tensorflow/tools/proto_splitter/python/BUILD @@ -1,8 +1,7 @@ -load("//tensorflow:strict.default.bzl", "py_strict_test") - # Description: # Python library for splitting and joining large protos. load("//tensorflow:pytype.default.bzl", "pytype_strict_library") +load("//tensorflow:strict.default.bzl", "py_strict_test") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -13,6 +12,11 @@ package( pytype_strict_library( name = "saved_model", srcs = ["saved_model.py"], + # NOTE(yibaimeng): To be removed when everything is migrated to `pywrap_saved_model.Save`. 
+ visibility = [ + "//tensorflow:internal", + "//waymo/ml/deploy/tensorflow:__pkg__", + ], deps = [ "//tensorflow/core:protos_all_py", "//tensorflow/tools/proto_splitter:constants", @@ -31,6 +35,7 @@ py_strict_test( deps = [ ":saved_model", ":test_util", + #internal proto upb dep "//tensorflow/core:protos_all_py", "//tensorflow/python/platform:client_testlib", "//tensorflow/tools/proto_splitter:constants", @@ -52,6 +57,7 @@ py_strict_test( name = "test_util_test", srcs = ["test_util_test.py"], deps = [ + #internal proto upb dep "//tensorflow/python/framework:dtypes", "//tensorflow/python/platform:client_testlib", "//tensorflow/tools/proto_splitter/python:test_util", diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD index 43d46c8b5c1e79..d4e12622377a68 100644 --- a/tensorflow/tools/test/BUILD +++ b/tensorflow/tools/test/BUILD @@ -21,12 +21,10 @@ exports_files([ py_strict_library( name = "system_info_lib", - srcs = [ - "gpu_info_lib.py", - "system_info_lib.py", - ], + srcs = ["system_info_lib.py"], srcs_version = "PY3", deps = [ + ":gpu_info_lib", "//tensorflow:tensorflow_py", "//tensorflow/core:protos_all_py", "//tensorflow/python/client:device_lib", @@ -36,6 +34,19 @@ py_strict_library( ], ) +py_strict_library( + name = "gpu_info_lib", + srcs = ["gpu_info_lib.py"], + srcs_version = "PY3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/core:protos_all_py", + "//tensorflow/python/framework:errors", + "//tensorflow/python/platform:gfile", + "@six_archive//:six", + ], +) + py_strict_binary( name = "system_info", srcs = ["system_info.py"], @@ -54,6 +65,7 @@ py_strict_library( ], srcs_version = "PY3", deps = [ + ":gpu_info_lib", ":system_info_lib", "//tensorflow/core:protos_all_py", "//tensorflow/python/platform:gfile", diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl index 4f4201e62e73a4..f918da44589729 100644 --- a/tensorflow/tools/test/performance.bzl +++ b/tensorflow/tools/test/performance.bzl @@ -1,4 +1,12 @@ -load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test") +""" +Benchmark-related macros. +""" + +load( + "//tensorflow:tensorflow.default.bzl", + "cuda_py_strict_test", + "tf_py_strict_test", +) # Create a benchmark test target of a TensorFlow C++ test (tf_cc_*_test) def tf_cc_logged_benchmark( @@ -50,6 +58,33 @@ def tf_cc_logged_benchmark( **kwargs ) +def add_benchmark_tag_to_kwargs(kwargs): + """Adds the `benchmark-test` tag to the kwargs, if not already present. + + Notes: + For benchmarks which are not technically tests, but whose class methods + can still be discovered, and run as such via `bazel run`. + Args: + kwargs: kwargs to be passed to a test wrapper/rule further down. + Returns: + kwargs: kwargs with the tags including the `benchmark-test` tags. 
+ """ + benchmark_tag = "benchmark-test" + if "tags" in kwargs and kwargs["tags"] != None: + if benchmark_tag not in kwargs["tags"]: + kwargs["tags"].append(benchmark_tag) + else: + kwargs["tags"] = [benchmark_tag] + return kwargs + +def tf_py_benchmark_test(**kwargs): + kwargs = add_benchmark_tag_to_kwargs(kwargs) + tf_py_strict_test(**kwargs) + +def cuda_py_benchmark_test(**kwargs): + kwargs = add_benchmark_tag_to_kwargs(kwargs) + cuda_py_strict_test(**kwargs) + # Create a benchmark test target of a TensorFlow python test (*py_tests) def tf_py_logged_benchmark( name = None, diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.requirements.txt b/tensorflow/tools/tf_sig_build_dockerfiles/devel.requirements.txt index 62e73c996b1829..4a899fb3504e11 100644 --- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.requirements.txt +++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.requirements.txt @@ -42,8 +42,8 @@ scipy ~= 1.7.2; python_version < '3.11' scipy ~= 1.9.2; python_version == '3.11' # Earliest version for Python 3.11 scipy ~= 1.11.3; python_version >= '3.12' # Earliest version for Python 3.12 # Required for TFLite import from JAX tests -jax ~= 0.3.25; python_version <= '3.11' -jaxlib ~= 0.3.25; python_version <= '3.11' # Earliest version for Python 3.11 +jax ~= 0.4.1; python_version <= '3.11' +jaxlib ~= 0.4.1; python_version <= '3.11' # Earliest version for Python 3.11 # Needs to be addressed. Unblocked 2.4 branchcut cl/338377048 PyYAML ~= 6.0 # For uploading diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc index c3a792a539c607..0bfee88d16c710 100644 --- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc +++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc @@ -23,7 +23,7 @@ build --config=release_cpu_linux test:nonpip_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py39,-no_oss_py310 test:nonpip_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py39,-no_oss_py310 test:nonpip_filters --test_lang_filters=py --test_size_filters=small,medium -test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # For building libtensorflow archives test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test @@ -40,4 +40,4 @@ build:rbe --config=rbe_linux_cpu test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:pycpp_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:pycpp_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pycpp --config=pycpp_filters -- //tensorflow/... 
-//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu_gcc.bazelrc b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu_gcc.bazelrc index 14b75645a85fab..1f21969496f1e9 100644 --- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu_gcc.bazelrc +++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/cpu_gcc.bazelrc @@ -45,7 +45,7 @@ test --test_summary=short test:nonpip_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py39,-no_oss_py310 test:nonpip_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py39,-no_oss_py310 test:nonpip_filters --test_lang_filters=py --test_size_filters=small,medium -test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # For building libtensorflow archives test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test @@ -82,4 +82,4 @@ build:rbe --project_id="tensorflow-testing" test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:pycpp_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:pycpp_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc index e85df2f297ec0a..8bfadb03c734bd 100644 --- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc +++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc @@ -19,10 +19,10 @@ build --config=release_gpu_linux # Pass --config=nonpip to run the same suite of tests. If you want to run just # one test for investigation, you don't need --config=nonpip; just run the # bazel test invocation as normal. 
-test:nonpip_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py39,-no_oss_py310 -test:nonpip_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py39,-no_oss_py310 +test:nonpip_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py39,-no_oss_py310 +test:nonpip_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py39,-no_oss_py310 test:nonpip_filters --test_lang_filters=py --test_size_filters=small,medium -test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:nonpip --config=nonpip_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # For building libtensorflow archives test:libtensorflow_test -- //tensorflow/tools/lib_package:libtensorflow_test //tensorflow/tools/lib_package:libtensorflow_java_test @@ -39,4 +39,4 @@ build:rbe --config=rbe_linux_cuda test:pycpp_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:pycpp_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:pycpp_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:pycpp --config=pycpp_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh index dd7c1524ba9bec..1ba11e07f53a10 100755 --- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh +++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rename_and_verify_wheels.sh @@ -21,11 +21,7 @@ set -euxo pipefail for wheel in /tf/pkg/*.whl; do echo "Checking and renaming $wheel..." 
- if [[ "$wheel" =~ .*_tpu.* ]]; then - time python3 -m auditwheel repair --plat manylinux_2_27_x86_64 "$wheel" --wheel-dir /tf/pkg 2>&1 | tee check.txt - else - time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir /tf/pkg 2>&1 | tee check.txt - fi + time python3 -m auditwheel repair --plat manylinux2014_x86_64 "$wheel" --wheel-dir /tf/pkg 2>&1 | tee check.txt # We don't need the original wheel if it was renamed new_wheel=$(grep --extended-regexp --only-matching '/tf/pkg/\S+.whl' check.txt) diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats index 19662eb904bfb5..17b689dbedd6dd 100644 --- a/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats +++ b/tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats @@ -26,13 +26,9 @@ teardown_file() { rm -rf /tf/venv } -@test "Wheel is manylinux2014 (manylinux_2_17) compliant (TPU wheel is manylinux_2_27 compliant)" { +@test "Wheel is manylinux2014 (manylinux_2_17) compliant" { python3 -m auditwheel show "$TF_WHEEL" > audit.txt - if [[ "$TF_WHEEL" =~ .*_tpu.* ]]; then - grep --quiet 'This constrains the platform tag to "manylinux_2_27_x86_64"' audit.txt - else - grep --quiet 'This constrains the platform tag to "manylinux_2_17_x86_64"' audit.txt - fi + grep --quiet 'This constrains the platform tag to "manylinux_2_17_x86_64"' audit.txt } @test "Wheel conforms to upstream size limitations" { @@ -58,10 +54,12 @@ teardown_file() { # Note: this runs before the tests further down the file, so TF is installed in # the venv and the venv is active when those tests run. The venv gets cleaned # up in teardown_file() above. +# LibTPU is necessary if building a tpu package, and it is installed via +# "-f ". See tensorflow/setup.py. 
@test "Wheel is installable" { python3 -m venv /tf/venv source /tf/venv/bin/activate - python3 -m pip install "$TF_WHEEL" + python3 -m pip install "$TF_WHEEL" -f https://storage.googleapis.com/libtpu-releases/index.html } @test "TensorFlow is importable" { diff --git a/tensorflow/tools/toolchains/cross_compile/cc/BUILD b/tensorflow/tools/toolchains/cross_compile/cc/BUILD new file mode 100644 index 00000000000000..7db2527259d026 --- /dev/null +++ b/tensorflow/tools/toolchains/cross_compile/cc/BUILD @@ -0,0 +1,188 @@ +"""Toolchain configs for cross-compiling TensorFlow""" + +load("@bazel_tools//tools/cpp:unix_cc_toolchain_config.bzl", "cc_toolchain_config") + +package(default_visibility = ["//visibility:public"]) + +licenses(["restricted"]) + +cc_toolchain_suite( + name = "cross_compile_toolchain_suite", + toolchains = { + "aarch64": ":linux_aarch64_toolchain", + "k8": ":linux_x86_toolchain", + }, +) + +filegroup(name = "empty") + +cc_toolchain( + name = "linux_x86_toolchain", + all_files = ":empty", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = ":empty", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 1, + toolchain_config = ":linux_x86_toolchain_config", + toolchain_identifier = "linux_x86_toolchain", +) + +cc_toolchain_config( + name = "linux_x86_toolchain_config", + abi_libc_version = "local", + abi_version = "local", + builtin_sysroot = "/dt9", + compile_flags = [ + "--target=x86_64-unknown-linux-gnu", + "-fstack-protector", + "-Wall", + "-Wthread-safety", + "-Wself-assign", + "-Wunused-but-set-parameter", + "-Wno-free-nonheap-object", + "-fcolor-diagnostics", + "-fno-omit-frame-pointer", + "-mavx", + ], + compiler = "clang", + coverage_compile_flags = ["--coverage"], + coverage_link_flags = ["--coverage"], + cpu = "k8", + cxx_builtin_include_directories = [ + "/dt9/", + "/usr/lib/llvm-17/include/", + "/usr/lib/llvm-17/lib/clang/17/include", + ], + dbg_compile_flags = ["-g"], + host_system_name = "linux", + link_flags = [ + "--target=x86_64-unknown-linux-gnu", + "-fuse-ld=lld", + "--ld-path=/usr/lib/llvm-17/bin/ld.lld", + "-Wl,--undefined-version", + ], + link_libs = [ + "-lstdc++", + "-lm", + ], + opt_compile_flags = [ + "-g0", + "-O2", + "-D_FORTIFY_SOURCE=1", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + opt_link_flags = ["-Wl,--gc-sections"], + supports_start_end_lib = True, + target_libc = "", + target_system_name = "x86_64-unknown-linux-gnu", + tool_paths = { + "gcc": "/usr/lib/llvm-17/bin/clang", + "ld": "/usr/lib/llvm-17/bin/ld.lld", + "ar": "/usr/lib/llvm-17/bin/llvm-ar", + "cpp": "/usr/lib/llvm-17/bin/clang++", + "llvm-cov": "/usr/lib/llvm-17/bin/llvm-cov", + "nm": "/usr/lib/llvm-17/bin/llvm-nm", + "objdump": "/usr/lib/llvm-17/bin/llvm-objdump", + "strip": "/usr/lib/llvm-17/bin/llvm-strip", + }, + toolchain_identifier = "linux_x86_toolchain", + unfiltered_compile_flags = [ + "-no-canonical-prefixes", + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + "-Wno-unused-command-line-argument", + "-Wno-gnu-offsetof-extensions", + ], +) + +cc_toolchain( + name = "linux_aarch64_toolchain", + all_files = ":empty", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = ":empty", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 1, + toolchain_config = ":linux_aarch64_toolchain_config", + toolchain_identifier = "linux_aarch64_toolchain", +) + +cc_toolchain_config( + name = 
"linux_aarch64_toolchain_config", + abi_libc_version = "local", + abi_version = "local", + builtin_sysroot = "/dt10/", + compile_flags = [ + "--target=aarch64-unknown-linux-gnu", + "-fstack-protector", + "-Wall", + "-Wthread-safety", + "-Wself-assign", + "-Wunused-but-set-parameter", + "-Wno-free-nonheap-object", + "-fcolor-diagnostics", + "-fno-omit-frame-pointer", + "-mtune=generic", + "-march=armv8-a", + ], + compiler = "clang", + coverage_compile_flags = ["--coverage"], + coverage_link_flags = ["--coverage"], + cpu = "aarch64", + cxx_builtin_include_directories = [ + "/dt10/", + "/usr/lib/llvm-17/include/", + "/usr/lib/llvm-17/lib/clang/17/include", + ], + dbg_compile_flags = ["-g"], + host_system_name = "linux", + link_flags = [ + "--target=aarch64-unknown-linux-gnu", + "-fuse-ld=lld", + "--ld-path=/usr/lib/llvm-17/bin/ld.lld", + "-Wl,--undefined-version", + ], + link_libs = [ + "-lstdc++", + "-lm", + ], + opt_compile_flags = [ + "-g0", + "-O2", + "-D_FORTIFY_SOURCE=1", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + opt_link_flags = ["-Wl,--gc-sections"], + supports_start_end_lib = True, + target_libc = "", + target_system_name = "aarch64-unknown-linux-gnu", + tool_paths = { + "gcc": "/usr/lib/llvm-17/bin/clang", + "ld": "/usr/lib/llvm-17/bin/ld.lld", + "ar": "/usr/lib/llvm-17/bin/llvm-ar", + "cpp": "/usr/lib/llvm-17/bin/clang++", + "llvm-cov": "/usr/lib/llvm-17/bin/llvm-cov", + "nm": "/usr/lib/llvm-17/bin/llvm-nm", + "objdump": "/usr/lib/llvm-17/bin/llvm-objdump", + "strip": "/usr/lib/llvm-17/bin/llvm-strip", + }, + toolchain_identifier = "linux_aarch64_toolchain", + unfiltered_compile_flags = [ + "-no-canonical-prefixes", + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + "-Wno-unused-command-line-argument", + "-Wno-gnu-offsetof-extensions", + ], +) diff --git a/tensorflow/tools/toolchains/cross_compile/config/BUILD b/tensorflow/tools/toolchains/cross_compile/config/BUILD new file mode 100644 index 00000000000000..b6a504ba1449d6 --- /dev/null +++ b/tensorflow/tools/toolchains/cross_compile/config/BUILD @@ -0,0 +1,23 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["restricted"]) + +platform( + name = "linux_x86_64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], + exec_properties = { + "container-image": "docker://gcr.io/tensorflow-testing/ml-devinfra-linux-aarch64-cross-compile@sha256:11c5ac3b9b4e01cfa82b39b90826a9bfc5b806ccc92cd3d272e6bf861de43be1", + "OSFamily": "Linux", + }, +) + +platform( + name = "linux_aarch64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:aarch64", + ], +) diff --git a/tensorflow/tools/toolchains/remote_config/configs.bzl b/tensorflow/tools/toolchains/remote_config/configs.bzl index e8fc081f0af511..a7cbf50e47eea3 100644 --- a/tensorflow/tools/toolchains/remote_config/configs.bzl +++ b/tensorflow/tools/toolchains/remote_config/configs.bzl @@ -200,6 +200,28 @@ def initialize_rbe_configs(): python_install_path = "/usr/local", ) + tensorflow_rbe_config( + name = "ubuntu20.04-clang_manylinux2014-cuda12.3-cudnn8.9", + compiler = "/usr/lib/llvm-17/bin/clang", + cuda_version = "12.3", + cudnn_version = "8.9", + os = "ubuntu20.04-manylinux2014-multipython", + python_versions = ["3.9", "3.10", "3.11", "3.12"], + sysroot = "/dt9", + python_install_path = "/usr/local", + ) + + tensorflow_rbe_config( + name = "ubuntu20.04-gcc9_manylinux2014-cuda12.3-cudnn8.9", + compiler = "/dt9/usr/bin/gcc", + 
compiler_prefix = "/usr/bin", + cuda_version = "12.3", + cudnn_version = "8.9", + os = "ubuntu20.04-manylinux2014-multipython", + python_versions = ["3.9", "3.10", "3.11", "3.12"], + python_install_path = "/usr/local", + ) + tensorflow_rbe_win_config( name = "windows_py37", python_bin_path = "C:/Python37/python.exe", diff --git a/tensorflow/tools/toolchains/remote_config/containers.bzl b/tensorflow/tools/toolchains/remote_config/containers.bzl index bfb4634e810328..cd346c2816def1 100644 --- a/tensorflow/tools/toolchains/remote_config/containers.bzl +++ b/tensorflow/tools/toolchains/remote_config/containers.bzl @@ -5,8 +5,9 @@ container_digests = { # TF now uses only this container "cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython": "sha256:48612bd85709cd014711d0b0f87e0806f3567d06d2e81c6e860516b87498b821", # JAX manylinux2014 configs. - "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:ab39410baf2fc1d31d50540acec7640d7f4814fa694e2421b696b6f0a058d645", - "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:b699d6ae235ac601dc3e62391ac7c4606cb10331f8141983858c1580f5e74ddb", + "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:b112c0c77d4172fc025420938f13ea83f3ad480c01778e743a201e5e3f4710e1", + "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:9fefda035b4a12b24cd5bae56c7dbb9527a5fd06a41ced0a22ac86fe5ed26428", + "cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:afe68c3448734cb07b16005fd9ed47d19533eb8bf5acd92863735ce24766b93b", # ROCM, probably not all of them still in use "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb", "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204", @@ -98,6 +99,13 @@ containers = { "digest": container_digests["cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython"], }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython. + "cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython": { + "registry": "gcr.io", + "repository": "tensorflow-testing/nosla-cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython", + "digest": container_digests["cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython"], + }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython. 
"rocm-ubuntu18.04-manylinux2010-multipython": { "registry": "gcr.io", diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl index 65074788800b78..631771d1e09f34 100644 --- a/tensorflow/workspace2.bzl +++ b/tensorflow/workspace2.bzl @@ -150,9 +150,9 @@ def _tf_repositories(): # LINT.IfChange tf_http_archive( name = "XNNPACK", - sha256 = "88e0158aff1e1498e34dfcaf08d948a73a3246a04fe96e548da71f6b9245a009", - strip_prefix = "XNNPACK-c7e7cde37615a81a529c326aa278bfab4cd6fe5a", - urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/c7e7cde37615a81a529c326aa278bfab4cd6fe5a.zip"), + sha256 = "ca829b6486d7dcc0a63eae9d5d5be21dcb542e6601af4cada17b9d5f7d5fafb7", + strip_prefix = "XNNPACK-0cbbe74a16e6ca11acf8484ccac85f620336dea4", + urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/0cbbe74a16e6ca11acf8484ccac85f620336dea4.zip"), ) # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake) @@ -172,9 +172,9 @@ def _tf_repositories(): tf_http_archive( name = "cpuinfo", - strip_prefix = "cpuinfo-959002f82d7962a473d8bf301845f2af720e0aa4", - sha256 = "a0f53ccfb477c57753c595df02bf79ed67bf092fd9a5c61ec5b8992b81bc1e65", - urls = tf_mirror_urls("https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8bf301845f2af720e0aa4.zip"), + strip_prefix = "cpuinfo-ef634603954d88d2643d5809011288b890ac126e", + sha256 = "e07512a11e1c71687359a133f49d60583d7465b737fe5dbe11f461c9aaa72a2b", + urls = tf_mirror_urls("https://github.com/pytorch/cpuinfo/archive/ef634603954d88d2643d5809011288b890ac126e.zip"), ) tf_http_archive( @@ -186,6 +186,14 @@ def _tf_repositories(): urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v0.9.zip"), ) + tf_http_archive( + name = "cutlass_archive", + build_file = "//third_party:cutlass.BUILD", + sha256 = "ea1b7f96919460a5d80b09c1b246652539a8605600b2be4cccc02c254bccbe50", + strip_prefix = "cutlass-5783d6dbd0c34032371cce2bd999fc76007520d7", + urls = tf_mirror_urls("https://github.com/chsigg/cutlass/archive/5783d6dbd0c34032371cce2bd999fc76007520d7.tar.gz"), + ) + tf_http_archive( name = "mkl_dnn_v1", build_file = "//third_party/mkl_dnn:mkldnn_v1.BUILD", @@ -589,6 +597,16 @@ def _tf_repositories(): urls = tf_mirror_urls("https://github.com/google/pprof/archive/83db2b799d1f74c40857232cb5eb4c60379fe6c2.tar.gz"), ) + # The CUDA 11 toolkit ships with CUB. We should be able to delete this rule + # once TF drops support for CUDA 10. + tf_http_archive( + name = "cub_archive", + build_file = "//third_party:cub.BUILD", + sha256 = "162514b3cc264ac89d91898b58450190b8192e2af1142cf8ccac2d59aa160dda", + strip_prefix = "cub-1.9.9", + urls = tf_mirror_urls("https://github.com/NVlabs/cub/archive/1.9.9.zip"), + ) + tf_http_archive( name = "nvtx_archive", build_file = "//third_party:nvtx.BUILD", diff --git a/third_party/cutlass.BUILD b/third_party/cutlass.BUILD new file mode 100644 index 00000000000000..923d2f044c395a --- /dev/null +++ b/third_party/cutlass.BUILD @@ -0,0 +1,24 @@ +# Description: +# CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance +# matrix-matrix multiplication (GEMM) and related computations at all levels and scales within CUDA. 
+ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # MIT + +exports_files(["LICENSE.txt"]) + +filegroup( + name = "cutlass_header_files", + srcs = glob([ + "include/**", + ]), +) + +cc_library( + name = "cutlass", + hdrs = [":cutlass_header_files"], + strip_include_prefix = "/include", +) diff --git a/third_party/flatbuffers/workspace.bzl b/third_party/flatbuffers/workspace.bzl index 1aa9b2ff2d00ba..a0b943d7a9487b 100644 --- a/third_party/flatbuffers/workspace.bzl +++ b/third_party/flatbuffers/workspace.bzl @@ -2,12 +2,20 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") +# _FLATBUFFERS_GIT_COMMIT / _FLATBUFFERS_SHA256 were added due to an urgent Flatbuffers change that +# was needed for Flatbuffers/TfLite to be compatible with Android API level >= 23. +# They can be removed in the next flatbuffers official release / update. +_FLATBUFFERS_GIT_COMMIT = "7d6d99c6befa635780a4e944d37ebfd58e68a108" + +# curl -L https://github.com/google/flatbuffers/archive/<_FLATBUFFERS_GIT_COMMIT>.tar.gz | shasum -a 256 +_FLATBUFFERS_SHA256 = "d27761f6b2fb1017ec00ed317a7b98cb7aed86b81d90528b498fb17ec13579a1" + def repo(): tf_http_archive( name = "flatbuffers", - strip_prefix = "flatbuffers-23.5.26", - sha256 = "1cce06b17cddd896b6d73cc047e36a254fb8df4d7ea18a46acf16c4c0cd3f3f3", - urls = tf_mirror_urls("https://github.com/google/flatbuffers/archive/v23.5.26.tar.gz"), + strip_prefix = "flatbuffers-%s" % _FLATBUFFERS_GIT_COMMIT, + sha256 = _FLATBUFFERS_SHA256, + urls = tf_mirror_urls("https://github.com/google/flatbuffers/archive/%s.tar.gz" % _FLATBUFFERS_GIT_COMMIT), build_file = "//third_party/flatbuffers:flatbuffers.BUILD", system_build_file = "//third_party/flatbuffers:BUILD.system", link_files = { diff --git a/third_party/gemmlowp/workspace.bzl b/third_party/gemmlowp/workspace.bzl index b98035569852e2..884f707719a623 100644 --- a/third_party/gemmlowp/workspace.bzl +++ b/third_party/gemmlowp/workspace.bzl @@ -7,8 +7,8 @@ def repo(): # Attention: tools parse and update these lines.
# LINT.IfChange - GEMMLOWP_COMMIT = "e844ffd17118c1e17d94e1ba4354c075a4577b88" - GEMMLOWP_SHA256 = "522b7a82d920ebd0c4408a5365866a40b81d1c0d60b2369011d315cca03c6476" + GEMMLOWP_COMMIT = "16e8662c34917be0065110bfcd9cc27d30f52fdf" + GEMMLOWP_SHA256 = "7dc418717c8456473fac4ff2288b71057e3dcb72894524c734a4362cdb51fa8b" # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/gemmlowp.cmake) tf_http_archive( diff --git a/third_party/gif_fix_image_counter.patch b/third_party/gif_fix_image_counter.patch index 1d72f75d6e80f4..2184e18af1b435 100644 --- a/third_party/gif_fix_image_counter.patch +++ b/third_party/gif_fix_image_counter.patch @@ -1,5 +1,5 @@ diff --git a/dgif_lib.c b/dgif_lib.c -index 82fc097..c6700a9 100644 +index 82fc097..214a0e7 100644 --- a/dgif_lib.c +++ b/dgif_lib.c @@ -810,7 +810,8 @@ DGifSetupDecompress(GifFileType *GifFile) @@ -12,7 +12,7 @@ index 82fc097..c6700a9 100644 } BitsPerPixel = CodeSize; -@@ -1118,6 +1119,28 @@ DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, GifByteType *NextByte) +@@ -1118,6 +1119,31 @@ DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, GifByteType *NextByte) return GIF_OK; } @@ -29,6 +29,9 @@ index 82fc097..c6700a9 100644 + if (GifFile->SavedImages[GifFile->ImageCount].RasterBits != NULL) { + free(GifFile->SavedImages[GifFile->ImageCount].RasterBits); + } ++ if (GifFile->SavedImages[GifFile->ImageCount].ImageDesc.ColorMap != NULL) { ++ GifFreeMapObject(GifFile->SavedImages[GifFile->ImageCount].ImageDesc.ColorMap); ++ } + + // Realloc array according to the new image counter. + SavedImage *correct_saved_images = (SavedImage *)reallocarray( @@ -41,7 +44,7 @@ index 82fc097..c6700a9 100644 /****************************************************************************** This routine reads an entire GIF into core, hanging all its state info off the GifFileType pointer. 
Call DGifOpenFileName() or DGifOpenFileHandle() -@@ -1148,17 +1171,20 @@ DGifSlurp(GifFileType *GifFile) +@@ -1148,17 +1174,20 @@ DGifSlurp(GifFileType *GifFile) /* Allocate memory for the image */ if (sp->ImageDesc.Width <= 0 || sp->ImageDesc.Height <= 0 || sp->ImageDesc.Width > (INT_MAX / sp->ImageDesc.Height)) { @@ -62,7 +65,7 @@ index 82fc097..c6700a9 100644 return GIF_ERROR; } -@@ -1177,13 +1203,17 @@ DGifSlurp(GifFileType *GifFile) +@@ -1177,13 +1206,17 @@ DGifSlurp(GifFileType *GifFile) j += InterlacedJumps[i]) { if (DGifGetLine(GifFile, sp->RasterBits+j*sp->ImageDesc.Width, diff --git a/third_party/gloo/BUILD b/third_party/gloo/BUILD new file mode 100644 index 00000000000000..3c413807167aeb --- /dev/null +++ b/third_party/gloo/BUILD @@ -0,0 +1 @@ +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) diff --git a/third_party/gloo/gloo.BUILD b/third_party/gloo/gloo.BUILD new file mode 100644 index 00000000000000..68ba4e3610da70 --- /dev/null +++ b/third_party/gloo/gloo.BUILD @@ -0,0 +1,97 @@ +# Description: +# Gloo is a collective communications library + +load("//third_party/bazel_skylib/rules:expand_template.bzl", "expand_template") + +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) + +exports_files(["LICENSE"]) + +substitions = { + "@GLOO_VERSION_MAJOR@": "9999", + "@GLOO_VERSION_MINOR@": "0", + "@GLOO_VERSION_PATCH@": "0", + "#cmakedefine01 GLOO_USE_CUDA": "#define GLOO_USE_CUDA 0", + "#cmakedefine01 GLOO_USE_NCCL": "#define GLOO_USE_NCCL 0", + "#cmakedefine01 GLOO_USE_ROCM": "#define GLOO_USE_ROCM 0", + "#cmakedefine01 GLOO_USE_RCCL": "#define GLOO_USE_RCCL 0", + "#cmakedefine01 GLOO_USE_REDIS": "#define GLOO_USE_REDIS 0", + "#cmakedefine01 GLOO_USE_IBVERBS": "#define GLOO_USE_IBVERBS 0", + "#cmakedefine01 GLOO_USE_MPI": "#define GLOO_USE_MPI 0", + "#cmakedefine01 GLOO_USE_LIBUV": "#define GLOO_USE_LIBUV 0", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_TCP": "#define GLOO_HAVE_TRANSPORT_TCP 1", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_TCP_TLS": "#define GLOO_HAVE_TRANSPORT_TCP_TLS 0", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_IBVERBS": "#define GLOO_HAVE_TRANSPORT_IBVERBS 0", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_UV": "#define GLOO_HAVE_TRANSPORT_UV 0", + "#cmakedefine01 GLOO_USE_AVX": "#define GLOO_USE_AVX __AVX__", +} + +expand_template( + name = "config", + out = "gloo/config.h", + substitutions = substitions, + template = "gloo/config.h.in", +) + +cc_library( + name = "gloo", + srcs = glob( + [ + "gloo/*.cc", + "gloo/common/*.cc", + "gloo/transport/*.cc", + ], + exclude = [ + "gloo/common/linux.cc", + "gloo/common/win.cc", + "gloo/cuda*.cc", + ], + ) + [ + "gloo/rendezvous/context.cc", + "gloo/rendezvous/file_store.cc", + "gloo/rendezvous/hash_store.cc", + "gloo/rendezvous/prefix_store.cc", + "gloo/rendezvous/store.cc", + ] + select({ + "@local_tsl//tsl:macos": [], + "@local_tsl//tsl:windows": [], + "//conditions:default": [ + "gloo/common/linux.cc", + ], + }), + copts = [ + "-fexceptions", + "-Wno-unused-variable", + ], + includes = ["."], + textual_hdrs = glob( + [ + "gloo/*.h", + "gloo/common/*.h", + "gloo/transport/*.h", + ], + exclude = [ + "gloo/cuda*.h", + "gloo/common/win.h", + ], + ) + [ + "gloo/config.h", + "gloo/rendezvous/context.h", + "gloo/rendezvous/file_store.h", + "gloo/rendezvous/hash_store.h", + "gloo/rendezvous/prefix_store.h", + "gloo/rendezvous/store.h", + ], +) + +cc_library( + name = "transport_tcp", + srcs = glob(["gloo/transport/tcp/*.cc"]), + hdrs = glob(["gloo/transport/tcp/*.h"]), + copts = 
["-fexceptions"], + deps = [":gloo"], +) diff --git a/third_party/gloo/workspace.bzl b/third_party/gloo/workspace.bzl new file mode 100644 index 00000000000000..ede168395acdc5 --- /dev/null +++ b/third_party/gloo/workspace.bzl @@ -0,0 +1,17 @@ +"""Provides the repository macro to import Gloo.""" + +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") + +def repo(): + """Imports Gloo.""" + + GLOO_COMMIT = "5354032ea08eadd7fc4456477f7f7c6308818509" + GLOO_SHA256 = "5759a06e6c8863c58e8ceadeb56f7c701fec89b2559ba33a103a447207bf69c7" + + tf_http_archive( + name = "gloo", + sha256 = GLOO_SHA256, + strip_prefix = "gloo-{commit}".format(commit = GLOO_COMMIT), + urls = tf_mirror_urls("https://github.com/facebookincubator/gloo/archive/{commit}.tar.gz".format(commit = GLOO_COMMIT)), + build_file = "//third_party/gloo:gloo.BUILD", + ) diff --git a/third_party/gpus/check_cuda_libs.py b/third_party/gpus/check_cuda_libs.py index b7d98ef2581157..afd6380b0ac203 100644 --- a/third_party/gpus/check_cuda_libs.py +++ b/third_party/gpus/check_cuda_libs.py @@ -23,6 +23,7 @@ """ import os import os.path +import platform import subprocess import sys @@ -38,6 +39,10 @@ class ConfigError(Exception): pass +def _is_windows(): + return platform.system() == "Windows" + + def check_cuda_lib(path, check_soname=True): """Tests if a library exists on disk and whether its soname matches the filename. @@ -52,7 +57,7 @@ def check_cuda_lib(path, check_soname=True): if not os.path.isfile(path): raise ConfigError("No library found under: " + path) objdump = which("objdump") - if check_soname and objdump is not None: + if check_soname and objdump is not None and not _is_windows(): # Decode is necessary as in py3 the return type changed from str to bytes output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") output = [line for line in output.splitlines() if "SONAME" in line] diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl index 81e54ad431fccf..0da1d7b58f4bb0 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl @@ -45,10 +45,11 @@ import pipes # Template values set by cuda_autoconf. CPU_COMPILER = ('%{cpu_compiler}') -GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') +HOST_COMPILER_PATH = ('%{host_compiler_path}') NVCC_PATH = '%{nvcc_path}' -PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) +PREFIX_DIR = os.path.dirname(HOST_COMPILER_PATH) +USE_CLANG_COMPILER = '%{use_clang_compiler}' NVCC_VERSION = '%{cuda_version}' def Log(s): @@ -253,13 +254,23 @@ def InvokeNvcc(argv, log=False): # Force C++17 dialect (note, everything in just one string!) nvccopts += ' --std c++17 ' nvccopts += fatbin_options + # The option `-allow-unsupported-compiler` is required for the combination of + # NVCC+clang compilers. + # The following message appears if this option is not provided: + # unsupported clang version! clang version must be less than 16 and greater + # than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used + # to override this version check; however, using an unsupported host compiler + # may cause compilation failure or incorrect run time execution. + # Use at your own risk. 
+ if USE_CLANG_COMPILER: + nvccopts += ' -allow-unsupported-compiler --expt-extended-lambda --expt-relaxed-constexpr ' if depfiles: # Generate the dependency file depfile = depfiles[0] cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + host_compiler_options + '"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + + ' --compiler-bindir=' + HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) if log: Log(cmd) @@ -269,7 +280,7 @@ def InvokeNvcc(argv, log=False): cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + host_compiler_options + ' -fPIC"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + + ' --compiler-bindir=' + HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' -c ' + srcs + out) diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl index 8fb22313010a45..77ec948af32c6e 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl @@ -86,8 +86,8 @@ def GetHostCompilerOptions(argv): opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) if args.g: opts += ' -g' + ' -g'.join(sum(args.g, [])) - #if args.fno_canonical_system_headers: - # opts += ' -fno-canonical-system-headers' + if args.fno_canonical_system_headers: + opts += ' -no-canonical-prefixes' if args.sysroot: opts += ' --sysroot ' + args.sysroot[0] diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl new file mode 100644 index 00000000000000..c46e09484fdfad --- /dev/null +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -0,0 +1,256 @@ +#!/usr/bin/env python +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. + +DESCRIPTION: + This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc +""" + +from argparse import ArgumentParser +import os +import subprocess +import re +import sys +import tempfile + +# Template values set by cuda_autoconf. +CPU_COMPILER = ('%{cpu_compiler}') +GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') + +NVCC_PATH = '%{nvcc_path}' +NVCC_VERSION = '%{cuda_version}' +NVCC_TEMP_DIR = "%{nvcc_tmp_dir}" + +def Log(s): + print('gpus/crosstool: {0}'.format(s)) + + +def GetOptionValue(argv, option): + """Extract the list of values for option from options. + + Args: + option: The option whose value to extract. + + Returns: + 1. A list of values, either directly following the option, + (eg., /opt val1 val2) or values collected from multiple occurrences of + the option (eg., /opt val1 /opt val2). + 2. The leftover options. 
+ """ + + parser = ArgumentParser(prefix_chars='-/') + parser.add_argument(option, nargs='*', action='append') + option = option.lstrip('-/').replace('-', '_') + args, leftover = parser.parse_known_args(argv) + if args and vars(args)[option]: + return (sum(vars(args)[option], []), leftover) + return ([], leftover) + +def _update_options(nvcc_options): + if NVCC_VERSION in ("7.0",): + return nvcc_options + + update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } + return [ update_options[opt] if opt in update_options else opt + for opt in nvcc_options ] + +def GetNvccOptions(argv): + """Collect the -nvcc_options values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + 1. The string that can be passed directly to nvcc. + 2. The leftover options. + """ + + parser = ArgumentParser() + parser.add_argument('-nvcc_options', nargs='*', action='append') + + args, leftover = parser.parse_known_args(argv) + + if args.nvcc_options: + options = _update_options(sum(args.nvcc_options, [])) + return (['--' + a for a in options], leftover) + return ([], leftover) + + +def InvokeNvcc(argv, log=False): + """Call nvcc with arguments assembled from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + log: True if logging is requested. + + Returns: + The return value of calling os.system('nvcc ' + args) + """ + + src_files = [f for f in argv if + re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] + if len(src_files) == 0: + raise Error('No source files found for cuda compilation.') + + out_file = [ f for f in argv if f.startswith('/Fo') ] + if len(out_file) != 1: + raise Error('Please specify exactly one output file for cuda compilation.') + out = ['-o', out_file[0][len('/Fo'):]] + + nvcc_compiler_options, argv = GetNvccOptions(argv) + + opt_option, argv = GetOptionValue(argv, '/O') + opt = ['-g'] + if (len(opt_option) > 0 and opt_option[0] != 'd'): + opt = ['-O2'] + + include_options, argv = GetOptionValue(argv, '/I') + includes = ["-I " + include for include in include_options] + + defines, argv = GetOptionValue(argv, '/D') + defines = [ + '-D' + define + for define in defines + if 'BAZEL_CURRENT_REPOSITORY' not in define + ] + + undefines, argv = GetOptionValue(argv, '/U') + undefines = ['-U' + define for define in undefines] + + fatbin_options, argv = GetOptionValue(argv, '-Xcuda-fatbinary') + fatbin_options = ['--fatbin-options=' + option for option in fatbin_options] + + # The rest of the unrecognized options should be passed to host compiler + host_compiler_options = [option for option in argv if option not in (src_files + out_file)] + + m_options = ["-m64"] + + nvccopts = ['-D_FORCE_INLINES'] + compute_capabilities, argv = GetOptionValue(argv, "--cuda-gpu-arch") + for capability in compute_capabilities: + capability = capability[len('sm_'):] + nvccopts += [ + r'-gencode=arch=compute_%s,"code=sm_%s"' % (capability, capability) + ] + compute_capabilities, argv = GetOptionValue(argv, '--cuda-include-ptx') + for capability in compute_capabilities: + capability = capability[len('sm_'):] + nvccopts += [ + r'-gencode=arch=compute_%s,"code=compute_%s"' % (capability, capability) + ] + _, argv = GetOptionValue(argv, '--no-cuda-include-ptx') + + # nvcc doesn't respect the INCLUDE and LIB env vars from MSVC, + # so we explicitly specify the system include paths and library search paths.
+ if 'INCLUDE' in os.environ: + nvccopts += [('--system-include="%s"' % p) for p in os.environ['INCLUDE'].split(";")] + if 'LIB' in os.environ: + nvccopts += [('--library-path="%s"' % p) for p in os.environ['LIB'].split(";")] + + nvccopts += nvcc_compiler_options + nvccopts += undefines + nvccopts += defines + nvccopts += m_options + nvccopts += fatbin_options + nvccopts += ['--compiler-options=' + ",".join(host_compiler_options)] + nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files + # Specify a unique temp directory for nvcc to generate intermediate files, + # then Bazel can ignore files under NVCC_TEMP_DIR during dependency check + # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver + # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. + if os.path.isfile(NVCC_TEMP_DIR): + os.remove(NVCC_TEMP_DIR) + if not os.path.exists(NVCC_TEMP_DIR): + os.makedirs(NVCC_TEMP_DIR) + # Provide a unique dir for each compiling action to avoid conflicts. + tempdir = tempfile.mkdtemp(dir = NVCC_TEMP_DIR) + nvccopts += ['--keep', '--keep-dir', tempdir] + # Force C++17 dialect (note, everything in just one string!) + nvccopts += ['--std c++17'] + if log: + Log([NVCC_PATH] + nvccopts) + + # Store command line options in a file to avoid hitting the character limit. + optsfile = tempfile.NamedTemporaryFile(mode='w', dir=tempdir, delete=False) + optsfile.write("\n".join(nvccopts)) + optsfile.close() + + proc = subprocess.Popen([NVCC_PATH, "--options-file", optsfile.name], + stdout=sys.stdout, + stderr=sys.stderr, + env=os.environ.copy(), + shell=True) + proc.wait() + return proc.returncode + +def ExpandParamsFileForArgv(): + new_argv = [] + for arg in sys.argv: + if arg.startswith("@"): + with open(arg.strip("@")) as f: + new_argv.extend([l.strip() for l in f.readlines()]) + else: + new_argv.append(arg) + + sys.argv = new_argv + +def ProcessFlagForCommandFile(flag): + if flag.startswith("/D") or flag.startswith("-D"): + # We need to re-escape /DFOO="BAR" as /DFOO=\"BAR\", so that we get + # `#define FOO "BAR"` after expansion as a string literal define + if flag.endswith('"') and not flag.endswith('\\"'): + flag = '\\"'.join(flag.split('"', 1)) + flag = '\\"'.join(flag.rsplit('"', 1)) + return flag + return flag + +def main(): + ExpandParamsFileForArgv() + parser = ArgumentParser() + parser.add_argument('-x', nargs=1) + parser.add_argument('--cuda_log', action='store_true') + args, leftover = parser.parse_known_args(sys.argv[1:]) + + if args.x and args.x[0] == 'cuda': + if args.cuda_log: Log('-x cuda') + if args.cuda_log: Log('using nvcc') + return InvokeNvcc(leftover, log=args.cuda_log) + + # Strip our flags before passing through to the CPU compiler for files which + # are not -x cuda. We can't just pass 'leftover' because it also strips -x. + # We not only want to pass -x to the CPU compiler, but also keep it in its + # relative location in the argv list (the compiler is actually sensitive to + # this). + cpu_compiler_flags = [flag for flag in sys.argv[1:] + if not flag.startswith(('--cuda_log')) + and not flag.startswith(('-nvcc_options'))] + output = [flag for flag in cpu_compiler_flags if flag.startswith("/Fo")] + + # Store command line options in a file to avoid hitting the character limit. 
+ if len(output) == 1: + commandfile_path = output[0][3:] + ".msvc_params" + commandfile = open(commandfile_path, "w") + cpu_compiler_flags = [ProcessFlagForCommandFile(flag) for flag in cpu_compiler_flags] + commandfile.write("\n".join(cpu_compiler_flags)) + commandfile.close() + return subprocess.call([CPU_COMPILER, "@" + commandfile_path]) + else: + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 700e040a88eeca..90a18b90de048c 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -61,23 +61,23 @@ cuda_header_library( cc_library( name = "cudart_static", - srcs = ["cuda/lib/libcudart_static.a"], + srcs = ["cuda/lib/%{cudart_static_lib}"], linkopts = [ "-ldl", - "-lrt", "-lpthread", + %{cudart_static_linkopt} ], ) cc_library( name = "cuda_driver", - srcs = ["cuda/lib/libcuda.so"], + srcs = ["cuda/lib/%{cuda_driver_lib}"], ) cc_library( name = "cudart", - srcs = glob(["cuda/lib/libcudart.so.*"]), - data = glob(["cuda/lib/libcudart.so.*"]), + srcs = ["cuda/lib/%{cudart_lib}"], + data = ["cuda/lib/%{cudart_lib}"], linkstatic = 1, ) @@ -128,30 +128,30 @@ cuda_header_library( cc_library( name = "cublas", - srcs = glob(["cuda/lib/libcublas.so.*"]), - data = glob(["cuda/lib/libcublas.so.*"]), + srcs = ["cuda/lib/%{cublas_lib}"], + data = ["cuda/lib/%{cublas_lib}"], linkstatic = 1, ) cc_library( name = "cublasLt", - srcs = glob(["cuda/lib/libcublasLt.so.*"]), - data = glob(["cuda/lib/libcublasLt.so.*"]), + srcs = ["cuda/lib/%{cublasLt_lib}"], + data = ["cuda/lib/%{cublasLt_lib}"], linkstatic = 1, ) cc_library( name = "cusolver", - srcs = glob(["cuda/lib/libcusolver.so.*"]), - data = glob(["cuda/lib/libcusolver.so.*"]), + srcs = ["cuda/lib/%{cusolver_lib}"], + data = ["cuda/lib/%{cusolver_lib}"], linkopts = ["-lgomp"], linkstatic = 1, ) cc_library( name = "cudnn", - srcs = glob(["cuda/lib/libcudnn.so.*"]), - data = glob(["cuda/lib/libcudnn.so.*"]), + srcs = ["cuda/lib/%{cudnn_lib}"], + data = ["cuda/lib/%{cudnn_lib}"], linkstatic = 1, ) @@ -165,15 +165,15 @@ cc_library( cc_library( name = "cufft", - srcs = glob(["cuda/lib/libcufft.so.*"]), - data = glob(["cuda/lib/libcufft.so.*"]), + srcs = ["cuda/lib/%{cufft_lib}"], + data = ["cuda/lib/%{cufft_lib}"], linkstatic = 1, ) cc_library( name = "curand", - srcs = glob(["cuda/lib/libcurand.so.*"]), - data = glob(["cuda/lib/libcurand.so.*"]), + srcs = ["cuda/lib/%{curand_lib}"], + data = ["cuda/lib/%{curand_lib}"], linkstatic = 1, ) @@ -192,7 +192,7 @@ cc_library( alias( name = "cub_headers", - actual = ":cuda_headers", + actual = "%{cub_actual}", ) cuda_header_library( @@ -213,13 +213,13 @@ cuda_header_library( cc_library( name = "cupti_dsos", - data = glob(["cuda/lib/libcupti.so.*"]), + data = ["cuda/lib/%{cupti_lib}"], ) cc_library( name = "cusparse", - srcs = glob(["cuda/lib/libcusparse.so.*"]), - data = glob(["cuda/lib/libcusparse.so.*"]), + srcs = ["cuda/lib/%{cusparse_lib}"], + data = ["cuda/lib/%{cusparse_lib}"], linkopts = ["-lgomp"], linkstatic = 1, ) diff --git a/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/gpus/cuda/BUILD.windows.tpl new file mode 100644 index 00000000000000..dee0e898d9ae7a --- /dev/null +++ b/third_party/gpus/cuda/BUILD.windows.tpl @@ -0,0 +1,238 @@ +load(":build_defs.bzl", "cuda_header_library") +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") 
+load("@bazel_skylib//lib:selects.bzl", "selects") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +# Config setting whether TensorFlow is built with CUDA support using clang. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang. +selects.config_setting_group( + name = "using_clang", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_clang", + ], +) + +# Config setting whether TensorFlow is built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_nvcc", + ], +) + +# Equivalent to using_clang && -c opt. +selects.config_setting_group( + name = "using_clang_opt", + match_all = [ + ":using_clang", + ":_opt", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. +cuda_header_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + ":cuda-include", + ], + include_prefix = "third_party/gpus", + includes = [ + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h + "cuda/include", + ], +) + +cc_import( + name = "cudart_static", + # /WHOLEARCHIVE:cudart_static.lib will cause a + # "Internal error during CImplib::EmitThunk" error. + # Treat this library as interface library to avoid being whole archived when + # linking a DLL that depends on this. + # TODO(pcloudy): Remove this rule after b/111278841 is resolved. 
+ interface_library = "cuda/lib/%{cudart_static_lib}", + system_provided = 1, +) + +cc_import( + name = "cuda_driver", + interface_library = "cuda/lib/%{cuda_driver_lib}", + system_provided = 1, +) + +cc_import( + name = "cudart", + interface_library = "cuda/lib/%{cudart_lib}", + system_provided = 1, +) + +cuda_header_library( + name = "cublas_headers", + hdrs = [":cublas-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cublas/include"], + strip_include_prefix = "cublas/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusolver_headers", + hdrs = [":cusolver-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusolver/include"], + strip_include_prefix = "cusolver/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cufft_headers", + hdrs = [":cufft-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cufft/include"], + strip_include_prefix = "cufft/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusparse_headers", + hdrs = [":cusparse-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusparse/include"], + strip_include_prefix = "cusparse/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "curand_headers", + hdrs = [":curand-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["curand/include"], + strip_include_prefix = "curand/include", + deps = [":cuda_headers"], +) + +cc_import( + name = "cublas", + interface_library = "cuda/lib/%{cublas_lib}", + system_provided = 1, +) + +cc_import( + name = "cublasLt", + interface_library = "cuda/lib/%{cublasLt_lib}", + system_provided = 1, +) + +cc_import( + name = "cusolver", + interface_library = "cuda/lib/%{cusolver_lib}", + system_provided = 1, +) + +cc_import( + name = "cudnn", + interface_library = "cuda/lib/%{cudnn_lib}", + system_provided = 1, +) + +cc_library( + name = "cudnn_header", + hdrs = [":cudnn-include"], + include_prefix = "third_party/gpus/cudnn", + strip_include_prefix = "cudnn/include", + deps = [":cuda_headers"], +) + +cc_import( + name = "cufft", + interface_library = "cuda/lib/%{cufft_lib}", + system_provided = 1, +) + +cc_import( + name = "curand", + interface_library = "cuda/lib/%{curand_lib}", + system_provided = 1, +) + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cublasLt", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +alias( + name = "cub_headers", + actual = "%{cub_actual}", +) + +cuda_header_library( + name = "cupti_headers", + hdrs = [":cuda-extras"], + include_prefix = "third_party/gpus", + includes = ["cuda/extras/CUPTI/include/"], + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "nvml_headers", + hdrs = [":nvml"], + include_prefix = "third_party/gpus", + includes = ["cuda/nvml/include/"], + deps = [":cuda_headers"], +) + +cc_import( + name = "cupti_dsos", + interface_library = "cuda/lib/%{cupti_lib}", + system_provided = 1, +) + +cc_import( + name = "cusparse", + interface_library = "cuda/lib/%{cusparse_lib}", + system_provided = 1, +) + +cc_library( + name = "libdevice_root", + data = [":cuda-nvvm"], +) + +bzl_library( + name = "build_defs_bzl", + srcs = ["build_defs.bzl"], + deps = [ + "@bazel_skylib//lib:selects", + ], +) + +py_library( + name = "cuda_config_py", + srcs = ["cuda/cuda_config.py"], +) + +%{copy_rules} diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 
3f2b67632e1a67..e73e41a0c383a2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -4,7 +4,8 @@ * `TF_NEED_CUDA`: Whether to enable building with CUDA. * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path - * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler. + * `TF_CUDA_CLANG`: Whether to use clang for C++ and Cuda compilation. + * `TF_NVCC_CLANG`: Whether to use clang for C++ and NVCC for Cuda compilation. * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for both host and device code compilation if TF_CUDA_CLANG is 1. * `TF_SYSROOT`: The sysroot to use when compiling. @@ -26,14 +27,27 @@ """ load("//third_party/clang_toolchain:download_clang.bzl", "download_clang") +load( + "@bazel_tools//tools/cpp:lib_cc_configure.bzl", + "escape_string", + "get_env_var", +) +load( + "@bazel_tools//tools/cpp:windows_cc_configure.bzl", + "find_msvc_tool", + "find_vc_path", + "setup_vc_env_vars", +) load( "//third_party/remote_config:common.bzl", "config_repo_label", "err_out", "execute", "get_bash_bin", + "get_cpu_value", "get_host_environ", "get_python_bin", + "is_windows", "raw_exec", "read_dir", "realpath", @@ -82,7 +96,16 @@ def verify_build_defines(params): "host_compiler_warnings", "linker_bin_path", "compiler_deps", + "msvc_cl_path", + "msvc_env_include", + "msvc_env_lib", + "msvc_env_path", + "msvc_env_tmp", + "msvc_lib_path", + "msvc_link_path", + "msvc_ml_path", "unfiltered_compile_flags", + "win_compiler_deps", ]: if ("%{" + param + "}") not in params: missing.append(param) @@ -96,13 +119,104 @@ def verify_build_defines(params): ".", ) +def _get_nvcc_tmp_dir_for_windows(repository_ctx): + """Return the Windows tmp directory for nvcc to generate intermediate source files.""" + escaped_tmp_dir = escape_string( + get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( + "\\", + "\\\\", + ), + ) + return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir" + +def _get_msvc_compiler(repository_ctx): + vc_path = find_vc_path(repository_ctx) + return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/") + +def _get_win_cuda_defines(repository_ctx): + """Return CROSSTOOL defines for Windows""" + + # If we are not on Windows, return fake values for Windows specific fields. + # This ensures the CROSSTOOL file parser is happy. + if not is_windows(repository_ctx): + return { + "%{msvc_env_tmp}": "msvc_not_used", + "%{msvc_env_path}": "msvc_not_used", + "%{msvc_env_include}": "msvc_not_used", + "%{msvc_env_lib}": "msvc_not_used", + "%{msvc_cl_path}": "msvc_not_used", + "%{msvc_ml_path}": "msvc_not_used", + "%{msvc_link_path}": "msvc_not_used", + "%{msvc_lib_path}": "msvc_not_used", + } + + vc_path = find_vc_path(repository_ctx) + if not vc_path: + auto_configure_fail( + "Visual C++ build tools not found on your machine."
+ + "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using", + ) + return {} + + env = setup_vc_env_vars(repository_ctx, vc_path) + escaped_paths = escape_string(env["PATH"]) + escaped_include_paths = escape_string(env["INCLUDE"]) + escaped_lib_paths = escape_string(env["LIB"]) + escaped_tmp_dir = escape_string( + get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( + "\\", + "\\\\", + ), + ) + + msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat" + msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace( + "\\", + "/", + ) + msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace( + "\\", + "/", + ) + msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace( + "\\", + "/", + ) + + # nvcc will generate some temporary source files under %{nvcc_tmp_dir} + # The generated files are guaranteed to have unique name, so they can share + # the same tmp directory + escaped_cxx_include_directories = [ + _get_nvcc_tmp_dir_for_windows(repository_ctx), + "C:\\\\botcode\\\\w", + ] + for path in escaped_include_paths.split(";"): + if path: + escaped_cxx_include_directories.append(path) + + return { + "%{msvc_env_tmp}": escaped_tmp_dir, + "%{msvc_env_path}": escaped_paths, + "%{msvc_env_include}": escaped_include_paths, + "%{msvc_env_lib}": escaped_lib_paths, + "%{msvc_cl_path}": msvc_cl_path, + "%{msvc_ml_path}": msvc_ml_path, + "%{msvc_link_path}": msvc_link_path, + "%{msvc_lib_path}": msvc_lib_path, + "%{cxx_builtin_include_directories}": to_list_of_strings( + escaped_cxx_include_directories, + ), + } + # TODO(dzc): Once these functions have been factored out of Bazel's # cc_configure.bzl, load them from @bazel_tools instead. # BEGIN cc_configure common functions. -def find_cc(repository_ctx): +def find_cc(repository_ctx, use_cuda_clang): """Find the C++ compiler.""" + if is_windows(repository_ctx): + return _get_msvc_compiler(repository_ctx) - if _use_cuda_clang(repository_ctx): + if use_cuda_clang: target_cc_name = "clang" cc_path_envvar = _CLANG_CUDA_COMPILER_PATH if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG): @@ -251,9 +365,10 @@ def _cuda_include_path(repository_ctx, cuda_config): Returns: A list of the gcc host compiler include directories. """ - nvcc_path = repository_ctx.path( - "%s/bin/nvcc" % cuda_config.cuda_toolkit_path, - ) + nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( + cuda_config.cuda_toolkit_path, + ".exe" if cuda_config.cpu_value == "Windows" else "", + )) # The expected exit code of this command is non-zero. Bazel remote execution # only caches commands with zero exit code. So force a zero exit code. @@ -314,6 +429,10 @@ def matches_version(environ_version, detected_version): return False return True +_NVCC_VERSION_PREFIX = "Cuda compilation tools, release " + +_DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR" + def compute_capabilities(repository_ctx): """Returns a list of strings representing cuda compute capabilities. @@ -356,11 +475,12 @@ def compute_capabilities(repository_ctx): return capabilities -def lib_name(base_name, version = None, static = False): +def lib_name(base_name, cpu_value, version = None, static = False): """Constructs the platform-specific name of a library. Args: base_name: The name of the library, such as "cudart" + cpu_value: The name of the host operating system. version: The version of the library. static: True the library is static or False if it is a shared object. 
@@ -368,20 +488,29 @@ def lib_name(base_name, version = None, static = False): The platform-specific name of the library. """ version = "" if not version else "." + version - if static: - return "lib%s.a" % base_name - return "lib%s.so%s" % (base_name, version) + if cpu_value in ("Linux", "FreeBSD"): + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + auto_configure_fail("Invalid cpu_value: %s" % cpu_value) -def _lib_path(lib, basedir, version, static): - file_name = lib_name(lib, version, static) +def _lib_path(lib, cpu_value, basedir, version, static): + file_name = lib_name(lib, cpu_value, version, static) return "%s/%s" % (basedir, file_name) def _should_check_soname(version, static): return version and not static -def _check_cuda_lib_params(lib, basedir, version, static = False): +def _check_cuda_lib_params(lib, cpu_value, basedir, version, static = False): return ( - _lib_path(lib, basedir, version, static), + _lib_path(lib, cpu_value, basedir, version, static), _should_check_soname(version, static), ) @@ -401,6 +530,8 @@ def _check_cuda_libs(repository_ctx, script_path, libs): all_paths = [path for path, _ in libs] checked_paths = execute(repository_ctx, [python_bin, "-c", cmd]).stdout.splitlines() + # Filter out empty lines from splitting on '\r\n' on Windows + checked_paths = [path for path in checked_paths if len(path) > 0] if all_paths != checked_paths: auto_configure_fail("Error with installed CUDA libs. Expected '%s'. Actual '%s'." % (all_paths, checked_paths)) @@ -418,62 +549,86 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): Returns: Map of library names to structs of filename and path. 
""" + cpu_value = cuda_config.cpu_value + stub_dir = "" if is_windows(repository_ctx) else "/stubs" + check_cuda_libs_params = { "cuda": _check_cuda_lib_params( "cuda", - cuda_config.config["cuda_library_dir"] + "/stubs", + cpu_value, + cuda_config.config["cuda_library_dir"] + stub_dir, version = None, + static = False, ), "cudart": _check_cuda_lib_params( "cudart", + cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cudart_version, + static = False, ), "cudart_static": _check_cuda_lib_params( "cudart_static", + cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cudart_version, static = True, ), "cublas": _check_cuda_lib_params( "cublas", + cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cublas_version, + static = False, ), "cublasLt": _check_cuda_lib_params( "cublasLt", + cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cublas_version, + static = False, ), "cusolver": _check_cuda_lib_params( "cusolver", + cpu_value, cuda_config.config["cusolver_library_dir"], cuda_config.cusolver_version, + static = False, ), "curand": _check_cuda_lib_params( "curand", + cpu_value, cuda_config.config["curand_library_dir"], cuda_config.curand_version, + static = False, ), "cufft": _check_cuda_lib_params( "cufft", + cpu_value, cuda_config.config["cufft_library_dir"], cuda_config.cufft_version, + static = False, ), "cudnn": _check_cuda_lib_params( "cudnn", + cpu_value, cuda_config.config["cudnn_library_dir"], cuda_config.cudnn_version, + static = False, ), "cupti": _check_cuda_lib_params( "cupti", + cpu_value, cuda_config.config["cupti_library_dir"], cuda_config.cupti_version, + static = False, ), "cusparse": _check_cuda_lib_params( "cusparse", + cpu_value, cuda_config.config["cusparse_library_dir"], cuda_config.cusparse_version, + static = False, ), } @@ -483,6 +638,10 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): paths = {filename: v[0] for (filename, v) in check_cuda_libs_params.items()} return paths +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "" if cpu_value == "Darwin" else "\"-lrt\"," + # TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl, # and nccl_configure.bzl. def find_cuda_config(repository_ctx, cuda_libraries): @@ -509,34 +668,37 @@ def _get_cuda_config(repository_ctx): cudart_version: The CUDA runtime version on the system. cudnn_version: The version of cuDNN on the system. compute_capabilities: A list of the system's CUDA compute capabilities. + cpu_value: The name of the host operating system. """ config = find_cuda_config(repository_ctx, ["cuda", "cudnn"]) + cpu_value = get_cpu_value(repository_ctx) toolkit_path = config["cuda_toolkit_path"] + is_windows = cpu_value == "Windows" cuda_version = config["cuda_version"].split(".") cuda_major = cuda_version[0] cuda_minor = cuda_version[1] - cuda_version = "%s.%s" % (cuda_major, cuda_minor) - cudnn_version = "%s" % config["cudnn_version"] + cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor) + cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"] if int(cuda_major) >= 11: # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability. 
if int(cuda_major) == 11: - cudart_version = "11.0" + cudart_version = "64_110" if is_windows else "11.0" cupti_version = cuda_version else: - cudart_version = "%s" % cuda_major + cudart_version = ("64_%s" if is_windows else "%s") % cuda_major cupti_version = cudart_version - cublas_version = "%s" % config["cublas_version"].split(".")[0] - cusolver_version = "%s" % config["cusolver_version"].split(".")[0] - curand_version = "%s" % config["curand_version"].split(".")[0] - cufft_version = "%s" % config["cufft_version"].split(".")[0] - cusparse_version = "%s" % config["cusparse_version"].split(".")[0] + cublas_version = ("64_%s" if is_windows else "%s") % config["cublas_version"].split(".")[0] + cusolver_version = ("64_%s" if is_windows else "%s") % config["cusolver_version"].split(".")[0] + curand_version = ("64_%s" if is_windows else "%s") % config["curand_version"].split(".")[0] + cufft_version = ("64_%s" if is_windows else "%s") % config["cufft_version"].split(".")[0] + cusparse_version = ("64_%s" if is_windows else "%s") % config["cusparse_version"].split(".")[0] elif (int(cuda_major), int(cuda_minor)) >= (10, 1): # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. # It changed from 'x.y' to just 'x' in CUDA 10.1. - cuda_lib_version = "%s" % cuda_major + cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major cudart_version = cuda_version cupti_version = cuda_version cublas_version = cuda_lib_version @@ -566,6 +728,7 @@ def _get_cuda_config(repository_ctx): cusparse_version = cusparse_version, cudnn_version = cudnn_version, compute_capabilities = compute_capabilities(repository_ctx), + cpu_value = cpu_value, config = config, ) @@ -611,6 +774,8 @@ error_gpu_disabled() """ def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + # Set up BUILD file for cuda/. 
_tpl( repository_ctx, @@ -625,6 +790,23 @@ def _create_dummy_repository(repository_ctx): repository_ctx, "cuda:BUILD", { + "%{cuda_driver_lib}": lib_name("cuda", cpu_value), + "%{cudart_static_lib}": lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": lib_name("cudart", cpu_value), + "%{cublas_lib}": lib_name("cublas", cpu_value), + "%{cublasLt_lib}": lib_name("cublasLt", cpu_value), + "%{cusolver_lib}": lib_name("cusolver", cpu_value), + "%{cudnn_lib}": lib_name("cudnn", cpu_value), + "%{cufft_lib}": lib_name("cufft", cpu_value), + "%{curand_lib}": lib_name("curand", cpu_value), + "%{cupti_lib}": lib_name("cupti", cpu_value), + "%{cusparse_lib}": lib_name("cusparse", cpu_value), + "%{cub_actual}": ":cuda_headers", "%{copy_rules}": """ filegroup(name="cuda-include") filegroup(name="cublas-include") @@ -643,9 +825,20 @@ filegroup(name="cudnn-include") repository_ctx.file("cuda/cuda/include/cublas.h") repository_ctx.file("cuda/cuda/include/cudnn.h") repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") - repository_ctx.file("cuda/cuda/lib/libcuda.so") - repository_ctx.file("cuda/cuda/lib/libcudart_static.a") repository_ctx.file("cuda/cuda/nvml/include/nvml.h") + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublasLt", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusparse", cpu_value)) # Set up cuda_config.h, which is used by # tensorflow/compiler/xla/stream_executor/dso_loader.cc. @@ -709,7 +902,7 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): cmd = \"""%s \""", )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): +def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions = None): """Returns a rule to recursively copy a directory. If exceptions is not None, it must be a list of files or directories in 'src_dir'; these will be excluded from copying. @@ -717,25 +910,39 @@ def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) + post_cmd = "" + if exceptions != None: + outs = [x for x in outs if not any([ + x.startswith(src_dir + "/" + y) + for y in exceptions + ])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + if exceptions != None: + for x in exceptions: + post_cmd += " ; rm -fR " + out_dir + "/" + x return """genrule( name = "%s", outs = [ %s ], - cmd = \"""cp -rLf "%s/." 
"%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) + cmd = \"""cp -rLf "%s/." "%s/" %s\""", +)""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) def _flag_enabled(repository_ctx, flag_name): return get_host_environ(repository_ctx, flag_name) == "1" def _use_cuda_clang(repository_ctx): + # Returns the flag if we need to use clang both for C++ and Cuda. return _flag_enabled(repository_ctx, "TF_CUDA_CLANG") +def _use_nvcc_and_clang(repository_ctx): + # Returns the flag if we need to use clang for C++ and NVCC for Cuda. + return _flag_enabled(repository_ctx, "TF_NVCC_CLANG") + def _tf_sysroot(repository_ctx): return get_host_environ(repository_ctx, _TF_SYSROOT, "") @@ -752,6 +959,22 @@ def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) +def _basename(repository_ctx, path_str): + """Returns the basename of a path of type string. + + This method is different from path.basename in that it also works if + the host platform is different from the execution platform + i.e. linux -> windows. + """ + + num_chars = len(path_str) + is_win = is_windows(repository_ctx) + for i in range(num_chars): + r_i = num_chars - 1 - i + if (is_win and path_str[r_i] == "\\") or path_str[r_i] == "/": + return path_str[r_i + 1:] + return path_str + def _create_local_cuda_repository(repository_ctx): """Creates the repository containing files set up to build with CUDA.""" @@ -760,14 +983,15 @@ def _create_local_cuda_repository(repository_ctx): # can easily lead to a O(n^2) runtime in the number of labels. # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778 tpl_paths = {filename: _tpl_path(repository_ctx, filename) for filename in [ - "cuda:BUILD", "cuda:build_defs.bzl", "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", + "crosstool:windows/msvc_wrapper_for_nvcc.py", "crosstool:BUILD", "crosstool:cc_toolchain_config.bzl", "cuda:cuda_config.h", "cuda:cuda_config.py", ]} + tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD.windows" if is_windows(repository_ctx) else "cuda:BUILD") cuda_config = _get_cuda_config(repository_ctx) @@ -879,7 +1103,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_lib_outs = [] for path in cuda_libs.values(): cuda_lib_srcs.append(path) - cuda_lib_outs.append("cuda/lib/" + path.rpartition("/")[-1]) + cuda_lib_outs.append("cuda/lib/" + _basename(repository_ctx, path)) copy_rules.append(make_copy_files_rule( repository_ctx, name = "cuda-lib", @@ -888,7 +1112,11 @@ def _create_local_cuda_repository(repository_ctx): )) # copy files mentioned in third_party/nccl/build_defs.bzl.tpl - bin_files = ["crt/link.stub", "bin2c", "fatbinary", "nvlink", "nvprune"] + file_ext = ".exe" if is_windows(repository_ctx) else "" + bin_files = ( + ["crt/link.stub"] + + [f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]] + ) copy_rules.append(make_copy_files_rule( repository_ctx, name = "cuda-bin", @@ -896,7 +1124,7 @@ def _create_local_cuda_repository(repository_ctx): outs = ["cuda/bin/" + f for f in bin_files], )) - # Select the headers based on the cuDNN version. + # Select the headers based on the cuDNN version (strip '64_' for Windows). 
cudnn_headers = ["cudnn.h"] if cuda_config.cudnn_version.rsplit("_", 1)[-1] >= "8": cudnn_headers += [ @@ -937,15 +1165,33 @@ def _create_local_cuda_repository(repository_ctx): }, ) + cub_actual = "@cub_archive//:cub" + if int(cuda_config.cuda_version_major) >= 11: + cub_actual = ":cuda_headers" + repository_ctx.template( "cuda/BUILD", tpl_paths["cuda:BUILD"], { + "%{cuda_driver_lib}": _basename(repository_ctx, cuda_libs["cuda"]), + "%{cudart_static_lib}": _basename(repository_ctx, cuda_libs["cudart_static"]), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cudart_lib}": _basename(repository_ctx, cuda_libs["cudart"]), + "%{cublas_lib}": _basename(repository_ctx, cuda_libs["cublas"]), + "%{cublasLt_lib}": _basename(repository_ctx, cuda_libs["cublasLt"]), + "%{cusolver_lib}": _basename(repository_ctx, cuda_libs["cusolver"]), + "%{cudnn_lib}": _basename(repository_ctx, cuda_libs["cudnn"]), + "%{cufft_lib}": _basename(repository_ctx, cuda_libs["cufft"]), + "%{curand_lib}": _basename(repository_ctx, cuda_libs["curand"]), + "%{cupti_lib}": _basename(repository_ctx, cuda_libs["cupti"]), + "%{cusparse_lib}": _basename(repository_ctx, cuda_libs["cusparse"]), + "%{cub_actual}": cub_actual, "%{copy_rules}": "\n".join(copy_rules), }, ) is_cuda_clang = _use_cuda_clang(repository_ctx) + is_nvcc_and_clang = _use_nvcc_and_clang(repository_ctx) tf_sysroot = _tf_sysroot(repository_ctx) should_download_clang = is_cuda_clang and _flag_enabled( @@ -956,7 +1202,7 @@ def _create_local_cuda_repository(repository_ctx): download_clang(repository_ctx, "crosstool/extra_tools") # Set up crosstool/ - cc = find_cc(repository_ctx) + cc = find_cc(repository_ctx, is_cuda_clang) cc_fullpath = cc if not should_download_clang else "crosstool/" + cc host_compiler_includes = get_cxx_inc_directories( @@ -993,7 +1239,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" - if is_cuda_clang: + if is_cuda_clang and not is_nvcc_and_clang: cuda_defines["%{host_compiler_path}"] = str(cc) cuda_defines["%{host_compiler_warnings}"] = """ # Some parts of the codebase set -Werror and hit this warning, so @@ -1002,10 +1248,12 @@ def _create_local_cuda_repository(repository_ctx): """ cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(host_compiler_includes) cuda_defines["%{compiler_deps}"] = ":empty" + cuda_defines["%{win_compiler_deps}"] = ":empty" repository_ctx.file( "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "", ) + repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "") else: cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" cuda_defines["%{host_compiler_warnings}"] = "" @@ -1025,22 +1273,40 @@ def _create_local_cuda_repository(repository_ctx): # .d file - given that includes that are prefixed with "../" multiple # time quickly grow longer than the root of the tree, this can lead to # bazel's header check failing. 
- cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\"" + if not is_cuda_clang: + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\"" - nvcc_path = "%s/nvcc" % cuda_config.config["cuda_binary_dir"] + file_ext = ".exe" if is_windows(repository_ctx) else "" + nvcc_path = "%s/nvcc%s" % (cuda_config.config["cuda_binary_dir"], file_ext) cuda_defines["%{compiler_deps}"] = ":crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{win_compiler_deps}"] = ":windows_msvc_wrapper_files" wrapper_defines = { "%{cpu_compiler}": str(cc), "%{cuda_version}": cuda_config.cuda_version, "%{nvcc_path}": nvcc_path, - "%{gcc_host_compiler_path}": str(cc), + "%{host_compiler_path}": str(cc), + "%{use_clang_compiler}": str(is_nvcc_and_clang), + "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx), } repository_ctx.template( "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", tpl_paths["crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc"], wrapper_defines, ) + repository_ctx.file( + "crosstool/windows/msvc_wrapper_for_nvcc.bat", + content = "@echo OFF\n{} -B external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py %*".format( + get_python_bin(repository_ctx), + ), + ) + repository_ctx.template( + "crosstool/windows/msvc_wrapper_for_nvcc.py", + tpl_paths["crosstool:windows/msvc_wrapper_for_nvcc.py"], + wrapper_defines, + ) + + cuda_defines.update(_get_win_cuda_defines(repository_ctx)) verify_build_defines(cuda_defines) @@ -1171,12 +1437,28 @@ def _cuda_autoconf_impl(repository_ctx): repository_ctx.symlink(build_file, "BUILD") +# For @bazel_tools//tools/cpp:windows_cc_configure.bzl +_MSVC_ENVVARS = [ + "BAZEL_VC", + "BAZEL_VC_FULL_VERSION", + "BAZEL_VS", + "BAZEL_WINSDK_FULL_VERSION", + "VS90COMNTOOLS", + "VS100COMNTOOLS", + "VS110COMNTOOLS", + "VS120COMNTOOLS", + "VS140COMNTOOLS", + "VS150COMNTOOLS", + "VS160COMNTOOLS", +] + _ENVIRONS = [ _GCC_HOST_COMPILER_PATH, _GCC_HOST_COMPILER_PREFIX, _CLANG_CUDA_COMPILER_PATH, "TF_NEED_CUDA", "TF_CUDA_CLANG", + "TF_NVCC_CLANG", _TF_DOWNLOAD_CLANG, _CUDA_TOOLKIT_PATH, _CUDNN_INSTALL_PATH, @@ -1188,7 +1470,7 @@ _ENVIRONS = [ "TMP", "TMPDIR", "TF_CUDA_PATHS", -] +] + _MSVC_ENVVARS remote_cuda_configure = repository_rule( implementation = _create_local_cuda_repository, diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py index 78292c7b40237a..b88694af5c014d 100644 --- a/third_party/gpus/find_cuda_config.py +++ b/third_party/gpus/find_cuda_config.py @@ -29,6 +29,8 @@ If TF_CUDA_PATHS is not specified, a OS specific default is used: Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'. + Windows: CUDA_PATH environment variable, or + C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\* For backwards compatibility, some libraries also use alternative base directories from other environment variables if they are specified. List of @@ -54,6 +56,7 @@ import io import os import glob +import platform import re import subprocess import sys @@ -70,6 +73,18 @@ class ConfigError(Exception): pass +def _is_linux(): + return platform.system() == "Linux" + + +def _is_windows(): + return platform.system() == "Windows" + + +def _is_macos(): + return platform.system() == "Darwin" + + def _matches_version(actual_version, required_version): """Checks whether some version meets the requirements. 
@@ -119,6 +134,8 @@ def _cartesian_product(first, second): def _get_ld_config_paths(): """Returns all directories from 'ldconfig -p'.""" + if not _is_linux(): + return [] ldconfig_path = which("ldconfig") or "/sbin/ldconfig" output = subprocess.check_output([ldconfig_path, "-p"]) pattern = re.compile(".* => (.*)") @@ -139,6 +156,13 @@ def _get_default_cuda_paths(cuda_version): elif not "." in cuda_version: cuda_version = cuda_version + ".*" + if _is_windows(): + return [ + os.environ.get( + "CUDA_PATH", + "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v%s\\" % + cuda_version) + ] return ["/usr/local/cuda-%s" % cuda_version, "/usr/local/cuda", "/usr", "/usr/local/cudnn"] + _get_ld_config_paths() @@ -188,8 +212,14 @@ def _find_file(base_paths, relative_paths, filepattern): def _find_library(base_paths, library_name, required_version): """Returns first valid path to the requested library.""" - filepattern = ".".join(["lib" + library_name, "so"] + - required_version.split(".")[:1]) + "*" + if _is_windows(): + filepattern = library_name + ".lib" + elif _is_macos(): + filepattern = "%s*.dylib" % (".".join(["lib" + library_name] + + required_version.split(".")[:1])) + else: + filepattern = ".".join(["lib" + library_name, "so"] + + required_version.split(".")[:1]) + "*" return _find_file(base_paths, _library_paths(), filepattern) @@ -238,7 +268,7 @@ def get_nvcc_version(path): return match.group(1) return None - nvcc_name = "nvcc" + nvcc_name = "nvcc.exe" if _is_windows() else "nvcc" nvcc_path, nvcc_version = _find_versioned_file(base_paths, [ "", "bin", @@ -528,6 +558,14 @@ def _get_legacy_path(env_name, default=[]): return _list_from_env(env_name, default) +def _normalize_path(path): + """Returns normalized path, with forward slashes on Windows.""" + path = os.path.realpath(path) + if _is_windows(): + path = path.replace("\\", "/") + return path + + def find_cuda_config(): """Returns a dictionary of CUDA library and header file paths.""" libraries = [argv.lower() for argv in sys.argv[1:]] @@ -596,7 +634,7 @@ def find_cuda_config(): for k, v in result.items(): if k.endswith("_dir") or k.endswith("_path"): - result[k] = os.path.realpath(v) + result[k] = _normalize_path(v) return result diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 90464b07264101..520c9bce6c5265 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -198,6 +198,8 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin): inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/15.0.0/include") inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/16.0.0/include") inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/17.0.0/include") + inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/17/include") + inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/18/include") # Support hcc based off clang 10.0.0 (for ROCm 3.3) inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/") @@ -345,14 +347,14 @@ def _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, miopen_path, rccl_ libs_paths = [ (name, _rocm_lib_paths(repository_ctx, name, path)) for name, path in [ - ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"), + ("amdhip64", rocm_config.rocm_toolkit_path), ("rocblas", rocm_config.rocm_toolkit_path), (hipfft_or_rocfft, rocm_config.rocm_toolkit_path), ("hiprand", rocm_config.rocm_toolkit_path), ("MIOpen", miopen_path), ("rccl", rccl_path), ("hipsparse", rocm_config.rocm_toolkit_path), - ("roctracer64", 
rocm_config.rocm_toolkit_path + "/roctracer"), + ("roctracer64", rocm_config.rocm_toolkit_path), ("rocsolver", rocm_config.rocm_toolkit_path), ] ] @@ -694,7 +696,7 @@ def _create_local_rocm_repository(repository_ctx): rocm_defines["%{unfiltered_compile_flags}"] = to_list_of_strings([ "-DTENSORFLOW_USE_ROCM=1", - "-D__HIP_PLATFORM_HCC__", + "-D__HIP_PLATFORM_AMD__", "-DEIGEN_USE_HIP", ]) @@ -729,7 +731,7 @@ def _create_local_rocm_repository(repository_ctx): "%{hipcc_env}": _hipcc_env(repository_ctx), "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib", "%{rocr_runtime_library}": "hsa-runtime64", - "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib", + "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/lib", "%{hip_runtime_library}": "amdhip64", "%{crosstool_verbose}": _crosstool_verbose(repository_ctx), "%{gcc_host_compiler_path}": str(cc), diff --git a/third_party/highwayhash/highwayhash.BUILD b/third_party/highwayhash/highwayhash.BUILD index 76f0c962ef8b8a..c24c987a276acd 100644 --- a/third_party/highwayhash/highwayhash.BUILD +++ b/third_party/highwayhash/highwayhash.BUILD @@ -286,6 +286,7 @@ cc_library( ":hh_portable", ":hh_types", ] + select({ + ":cpu_ppc": [":hh_vsx"], ":cpu_aarch64": [":hh_neon"], "//conditions:default": [ ":hh_avx2", diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch index 3597c8870d19ff..509398da979e83 100644 --- a/third_party/llvm/generated.patch +++ b/third_party/llvm/generated.patch @@ -1,2483 +1 @@ Auto generated patch. Do not edit or delete it, even if empty. -diff -ruN --strip-trailing-cr a/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h b/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h ---- a/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h -+++ b/llvm/include/llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h -@@ -1,42 +0,0 @@ --//===- MergeFunctionsIgnoringConst.h - Merge Functions ----------*- C++ -*-===// --// --// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. --// See https://llvm.org/LICENSE.txt for license information. --// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception --// --//===----------------------------------------------------------------------===// --// --// This pass transforms simple global variables that never have their address --// taken. If obviously true, it marks read/write globals as constant, deletes --// variables only stored to, etc. --// --//===----------------------------------------------------------------------===// -- --#ifndef LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H --#define LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H -- --#include "llvm/IR/PassManager.h" -- --namespace llvm { -- --class Module; -- --/// Merge functions that differ by constants. 
--class MergeFuncIgnoringConstPass -- : public PassInfoMixin { -- bool PtrAuthEnabled = false; -- unsigned PtrAuthKey = 0; -- std::string MergeFuncSuffix = ".Tm"; -- --public: -- MergeFuncIgnoringConstPass() {} -- MergeFuncIgnoringConstPass(bool PtrAuthEnabled, unsigned PtrAuthKey, -- std::string Suffix) -- : PtrAuthEnabled(PtrAuthEnabled), PtrAuthKey(PtrAuthKey), -- MergeFuncSuffix(Suffix) {} -- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); --}; -- --} // end namespace llvm -- --#endif // LLVM_TRANSFORMS_IPO_MERGEFUNCTIONSIGNORINGCONST_H -diff -ruN --strip-trailing-cr a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h ---- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h -+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h -@@ -379,7 +379,6 @@ - /// But, we are still not able to compare operands of PHI nodes, since those - /// could be operands from further BBs we didn't scan yet. - /// So it's impossible to use dominance properties in general. --protected: - mutable DenseMap sn_mapL, sn_mapR; - - // The global state we will use -diff -ruN --strip-trailing-cr a/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h b/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h ---- a/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h -+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h -@@ -1,58 +0,0 @@ --//===- FunctionComparatorIgnoringConst.h - Function Comparator --*- C++ -*-===// --// --// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. --// See https://llvm.org/LICENSE.txt for license information. --// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception --// --//===----------------------------------------------------------------------===// --// --// This file defines the FunctionComparatorIgnoringConst class which is used by --// the MergeFuncIgnoringConst pass for comparing functions. --// --//===----------------------------------------------------------------------===// -- --#ifndef LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H --#define LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H -- --#include "llvm/ADT/DenseMap.h" --#include "llvm/ADT/StringRef.h" --#include "llvm/IR/Attributes.h" --#include "llvm/IR/Instructions.h" --#include "llvm/IR/Operator.h" --#include "llvm/IR/ValueMap.h" --#include "llvm/Support/AtomicOrdering.h" --#include "llvm/Support/Casting.h" --#include "llvm/Transforms/Utils/FunctionComparator.h" --#include -- --namespace llvm { -- --/// FunctionComparatorIgnoringConst - Compares two functions to determine --/// whether or not they match when certain constants are ignored. --class FunctionComparatorIgnoringConst : public FunctionComparator { --public: -- FunctionComparatorIgnoringConst(const Function *F1, const Function *F2, -- GlobalNumberState *GN) -- : FunctionComparator(F1, F2, GN) {} -- -- int cmpOperandsIgnoringConsts(const Instruction *L, const Instruction *R, -- unsigned opIdx); -- -- int cmpBasicBlocksIgnoringConsts( -- const BasicBlock *BBL, const BasicBlock *BBR, -- const std::set> *InstOpndIndex = nullptr); -- -- int compareIgnoringConsts( -- const std::set> *InstOpndIndex = nullptr); -- -- int compareConstants(const Constant *L, const Constant *R) const { -- return cmpConstants(L, R); -- } -- --private: -- /// Scratch index for instruction in order during cmpOperandsIgnoringConsts. 
-- int Index = 0; --}; -- --} // end namespace llvm --#endif // LLVM_TRANSFORMS_UTILS_FUNCTIONCOMPARATORIGNORINGCONST_H -diff -ruN --strip-trailing-cr a/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h b/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h ---- a/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h -+++ b/llvm/include/llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h -@@ -1,29 +0,0 @@ --//===- MergeFunctionsIgnoringConst.h - Merge Functions ---------*- C++ -*-===// --// --// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. --// See https://llvm.org/LICENSE.txt for license information. --// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception --// --//===----------------------------------------------------------------------===// --// --// This file defines helpers used in the MergeFunctionsIgnoringConst. --// --//===----------------------------------------------------------------------===// -- --#ifndef LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H --#define LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H -- --#include "llvm/IR/IRBuilder.h" --#include "llvm/IR/Instructions.h" --#include "llvm/IR/Operator.h" -- --using namespace llvm; -- --bool isEligibleInstrunctionForConstantSharing(const Instruction *I); -- --bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx); -- --bool isEligibleFunction(Function *F); -- --Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy); --#endif // LLVM_TRANSFORMS_UTILS_MERGEFUNCTIONSIGNORINGCONST_H -diff -ruN --strip-trailing-cr a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp ---- a/llvm/lib/Passes/PassBuilder.cpp -+++ b/llvm/lib/Passes/PassBuilder.cpp -@@ -123,7 +123,6 @@ - #include "llvm/Transforms/IPO/LowerTypeTests.h" - #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" - #include "llvm/Transforms/IPO/MergeFunctions.h" --#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h" - #include "llvm/Transforms/IPO/ModuleInliner.h" - #include "llvm/Transforms/IPO/OpenMPOpt.h" - #include "llvm/Transforms/IPO/PartialInlining.h" -diff -ruN --strip-trailing-cr a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp ---- a/llvm/lib/Passes/PassBuilderPipelines.cpp -+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp -@@ -60,7 +60,6 @@ - #include "llvm/Transforms/IPO/LowerTypeTests.h" - #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" - #include "llvm/Transforms/IPO/MergeFunctions.h" --#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h" - #include "llvm/Transforms/IPO/ModuleInliner.h" - #include "llvm/Transforms/IPO/OpenMPOpt.h" - #include "llvm/Transforms/IPO/PartialInlining.h" -@@ -177,10 +176,6 @@ - "enable-merge-functions", cl::init(false), cl::Hidden, - cl::desc("Enable function merging as part of the optimization pipeline")); - --static cl::opt EnableMergeFuncIgnoringConst( -- "enable-merge-func-ignoring-const", cl::init(false), cl::Hidden, -- cl::desc("Enable function merger that ignores constants")); -- - static cl::opt EnablePostPGOLoopRotation( - "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden, - cl::desc("Run the loop rotation transformation after PGO instrumentation")); -@@ -1638,9 +1633,6 @@ - MPM.addPass(buildModuleOptimizationPipeline( - Level, ThinOrFullLTOPhase::ThinLTOPostLink)); - -- if (EnableMergeFuncIgnoringConst) -- MPM.addPass(MergeFuncIgnoringConstPass()); -- - // Emit annotation remarks. 
- addAnnotationRemarksPass(MPM); - -@@ -1966,9 +1958,6 @@ - - invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); - -- if (EnableMergeFuncIgnoringConst) -- MPM.addPass(MergeFuncIgnoringConstPass()); -- - // Emit annotation remarks. - addAnnotationRemarksPass(MPM); - -diff -ruN --strip-trailing-cr a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def ---- a/llvm/lib/Passes/PassRegistry.def -+++ b/llvm/lib/Passes/PassRegistry.def -@@ -87,7 +87,6 @@ - MODULE_PASS("lowertypetests", LowerTypeTestsPass()) - MODULE_PASS("metarenamer", MetaRenamerPass()) - MODULE_PASS("mergefunc", MergeFunctionsPass()) --MODULE_PASS("mergefunc-ignoring-const", MergeFuncIgnoringConstPass()) - MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) - MODULE_PASS("no-op-module", NoOpModulePass()) - MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass()) -diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt ---- a/llvm/lib/Transforms/IPO/CMakeLists.txt -+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt -@@ -30,7 +30,6 @@ - LowerTypeTests.cpp - MemProfContextDisambiguation.cpp - MergeFunctions.cpp -- MergeFunctionsIgnoringConst.cpp - ModuleInliner.cpp - OpenMPOpt.cpp - PartialInlining.cpp -diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp b/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp ---- a/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp -+++ b/llvm/lib/Transforms/IPO/MergeFunctionsIgnoringConst.cpp -@@ -1,1399 +0,0 @@ --//===--- MergeFunctionsIgnoringConst.cpp - Merge functions ----------------===// --// --// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. --// See https://llvm.org/LICENSE.txt for license information. --// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception --// --//===----------------------------------------------------------------------===// --// --// This pass looks for similar functions that are mergeable and folds them. --// The implementation is similar to LLVM's MergeFunctions pass. Instead of --// merging identical functions, it merges functions which only differ by a few --// constants in certain instructions. --// This is copied from Swift's implementation. --// --// This pass should run after LLVM's MergeFunctions pass, because it works best --// if there are no _identical_ functions in the module. --// Note: it would also work for identical functions but could produce more --// code overhead than the LLVM pass. 
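(For orientation only, not part of the patch: a minimal source-level sketch, in C++ rather than LLVM IR and with hypothetical names, of the transformation the pass deleted in this hunk performed. Two functions that differ only in a constant operand are folded into one body that takes the differing constant as an extra parameter, and the originals become thunks forwarding that constant.)

#include <cstdio>

static int g1 = 1;
static int g2 = 2;

// Before merging (conceptually):
//   int f1(int x) { return x + g1; }
//   int f2(int x) { return x + g2; }

// After merging: one shared body, parameterized on the differing operand...
static int f_merged(int x, const int *g) { return x + *g; }

// ...plus thunks that supply each original constant.
int f1(int x) { return f_merged(x, &g1); }
int f2(int x) { return f_merged(x, &g2); }

int main() {
  std::printf("%d %d\n", f1(10), f2(10)); // prints "11 12"
  return 0;
}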
--// --//===----------------------------------------------------------------------===// -- --#include "llvm/Transforms/IPO/MergeFunctionsIgnoringConst.h" --#include "llvm/ADT/DenseSet.h" --#include "llvm/ADT/FoldingSet.h" --#include "llvm/ADT/Hashing.h" --#include "llvm/ADT/STLExtras.h" --#include "llvm/ADT/SmallSet.h" --#include "llvm/ADT/StableHashing.h" --#include "llvm/ADT/Statistic.h" --#include "llvm/Analysis/ObjCARCUtil.h" --#include "llvm/IR/Attributes.h" --#include "llvm/IR/Constants.h" --#include "llvm/IR/DataLayout.h" --#include "llvm/IR/DebugInfoMetadata.h" --#include "llvm/IR/IRBuilder.h" --#include "llvm/IR/InlineAsm.h" --#include "llvm/IR/Instructions.h" --#include "llvm/IR/Module.h" --#include "llvm/IR/Operator.h" --#include "llvm/IR/StructuralHash.h" --#include "llvm/IR/ValueHandle.h" --#include "llvm/IR/ValueMap.h" --#include "llvm/InitializePasses.h" --#include "llvm/Pass.h" --#include "llvm/Support/CommandLine.h" --#include "llvm/Support/Debug.h" --#include "llvm/Support/ErrorHandling.h" --#include "llvm/Support/FileSystem.h" --#include "llvm/Support/Regex.h" --#include "llvm/Support/raw_ostream.h" --#include "llvm/Transforms/IPO.h" --#include "llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h" --#include -- --using namespace llvm; -- --#define DEBUG_TYPE "mergefunc-ignoring-const" -- --STATISTIC(NumFunctionsMergedIgnoringConst, "Number of functions merged"); --STATISTIC(NumThunksWrittenIgnoringConst, "Number of thunks generated"); -- --static cl::opt EnableAggressiveMergeFunc( -- "enable-aggressive-mergefunc-ignoringconst", cl::init(false), cl::Hidden, -- cl::desc("Enable more aggressive function merger")); -- --static cl::opt NumFunctionsIgnoringConstForSanityCheck( -- "mergefunc-ignoringconst-sanity", -- cl::desc("How many functions in module could be used for " -- "MergeFunctionsIgnoringConst pass sanity check. " -- "'0' disables this check. Works only with '-debug' key."), -- cl::init(0), cl::Hidden); -- --static cl::opt IgnoringConstMergeThreshold( -- "mergefunc-ignoringconst-threshold", -- cl::desc("Functions larger than the threshold are considered for merging." 
-- "'0' disables function merging at all."), -- cl::init(15), cl::Hidden); -- --cl::opt UseLinkOnceODRLinkageMerging( -- "use-linkonceodr-linkage-merging", cl::init(false), cl::Hidden, -- cl::desc( -- "Use LinkeOnceODR linkage to deduplicate the identical merged function " -- "(default = off)")); -- --cl::opt NoInlineForMergedFunction( -- "no-inline-merged-function", cl::init(false), cl::Hidden, -- cl::desc("set noinline for merged function (default = off)")); -- --static cl::opt -- CastArrayType("merge-cast-array-type", cl::init(false), cl::Hidden, -- cl::desc("support for casting array type (default = off)")); -- --static cl::opt IgnoreMusttailFunction( -- "ignore-musttail-function", cl::init(false), cl::Hidden, -- cl::desc( -- "ignore functions containing callsites with musttail (default = off)")); -- --static cl::opt AlwaysCallThunk( -- "merge-always-call-thunk", cl::init(false), cl::Hidden, -- cl::desc( -- "do not replace callsites and always emit a thunk (default = off)")); -- --static cl::list MergeBlockRegexFilters( -- "merge-block-regex", cl::Optional, -- cl::desc("Block functions from merging if they match the given " -- "regular expression"), -- cl::ZeroOrMore); -- --static cl::list MergeAllowRegexFilters( -- "merge-allow-regex", cl::Optional, -- cl::desc("Allow functions from merging if they match the given " -- "regular expression"), -- cl::ZeroOrMore); -- --bool isEligibleInstrunctionForConstantSharing(const Instruction *I) { -- switch (I->getOpcode()) { -- case Instruction::Load: -- case Instruction::Store: -- case Instruction::Call: -- return true; -- default: { -- if (EnableAggressiveMergeFunc && I->getOpcode() == Instruction::Invoke) -- return true; -- return false; -- } -- } --} -- --/// Returns true if the \OpIdx operand of \p CI is the callee operand. --static bool isCalleeOperand(const CallBase *CI, unsigned OpIdx) { -- return &CI->getCalledOperandUse() == &CI->getOperandUse(OpIdx); --} -- --static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) { -- if (CI->isInlineAsm()) -- return false; -- Function *Callee = CI->getCalledOperand() -- ? dyn_cast_or_null( -- CI->getCalledOperand()->stripPointerCasts()) -- : nullptr; -- if (Callee) { -- if (Callee->isIntrinsic()) -- return false; -- // objc_msgSend stubs must be called, and can't have their address taken. -- if (Callee->getName().startswith("objc_msgSend$")) -- return false; -- } -- if (isCalleeOperand(CI, OpIdx) && -- CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) { -- // The operand is the callee and it has already been signed. Ignore this -- // because we cannot add another ptrauth bundle to the call instruction. -- return false; -- } -- return true; --} -- --bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx) { -- assert(OpIdx < I->getNumOperands() && "Invalid operand index"); -- -- if (!isEligibleInstrunctionForConstantSharing(I)) -- return false; -- -- auto Opnd = I->getOperand(OpIdx); -- if (!isa(Opnd)) -- return false; -- -- if (const auto *CI = dyn_cast(I)) -- return canParameterizeCallOperand(CI, OpIdx); -- -- return true; --} -- --namespace { -- --/// MergeFuncIgnoringConst finds functions which only differ by constants in --/// certain instructions, e.g. resulting from specialized functions of layout --/// compatible types. --/// Such functions are merged by replacing the differing constants by a --/// parameter. The original functions are replaced by thunks which call the --/// merged function with the specific argument constants. 
--/// --class MergeFuncIgnoringConstImpl { --public: -- MergeFuncIgnoringConstImpl(bool PtrAuthEnabled, unsigned PtrAuthKey, -- std::string Suffix) -- : FnTree(FunctionNodeCmp(&GlobalNumbers)), PtrAuthEnabled(PtrAuthEnabled), -- PtrAuthKey(PtrAuthKey), MergeFuncSuffix(Suffix) {} -- -- bool runImpl(Module &M); -- --private: -- struct FunctionEntry; -- -- /// Describes the set of functions which are considered as "equivalent" (i.e. -- /// only differing by some constants). -- struct EquivalenceClass { -- /// The single-linked list of all functions which are a member of this -- /// equivalence class. -- FunctionEntry *First; -- -- /// A very cheap hash, used to early exit if functions do not match. -- llvm::IRHash Hash; -- -- public: -- // Note the hash is recalculated potentially multiple times, but it is -- // cheap. -- EquivalenceClass(FunctionEntry *First) -- : First(First), Hash(StructuralHash(*First->F)) { -- assert(!First->Next); -- } -- }; -- -- /// The function comparison operator is provided here so that FunctionNodes do -- /// not need to become larger with another pointer. -- class FunctionNodeCmp { -- GlobalNumberState *GlobalNumbers; -- -- public: -- FunctionNodeCmp(GlobalNumberState *GN) : GlobalNumbers(GN) {} -- bool operator()(const EquivalenceClass &LHS, -- const EquivalenceClass &RHS) const { -- // Order first by hashes, then full function comparison. -- if (LHS.Hash != RHS.Hash) -- return LHS.Hash < RHS.Hash; -- FunctionComparatorIgnoringConst FCmp(LHS.First->F, RHS.First->F, -- GlobalNumbers); -- return FCmp.compareIgnoringConsts() == -1; -- } -- }; -- using FnTreeType = std::set; -- -- /// -- struct FunctionEntry { -- FunctionEntry(Function *F, FnTreeType::iterator I) -- : F(F), Next(nullptr), NumUnhandledCallees(0), TreeIter(I), -- IsMerged(false) {} -- -- /// Back-link to the function. -- AssertingVH F; -- -- /// The next function in its equivalence class. -- FunctionEntry *Next; -- -- /// The number of not-yet merged callees. Used to process the merging in -- /// bottom-up call order. -- /// This is only valid in the first entry of an equivalence class. The -- /// counts of all functions in an equivalence class are accumulated in the -- /// first entry. -- int NumUnhandledCallees; -- -- /// The iterator of the function's equivalence class in the FnTree. -- /// It's FnTree.end() if the function is not in an equivalence class. -- FnTreeType::iterator TreeIter; -- -- /// True if this function is already a thunk, calling the merged function. -- bool IsMerged; -- }; -- -- /// Describes an operator of a specific instruction. -- struct OpLocation { -- Instruction *I; -- unsigned OpIndex; -- }; -- -- /// Information for a function. Used during merging. -- struct FunctionInfo { -- -- FunctionInfo(Function *F) -- : F(F), CurrentInst(nullptr), NumParamsNeeded(0) {} -- -- void init() { -- CurrentInst = &*F->begin()->begin(); -- NumParamsNeeded = 0; -- } -- -- /// Advances the current instruction to the next instruction. -- void nextInst() { -- assert(CurrentInst); -- if (CurrentInst->isTerminator()) { -- auto BlockIter = std::next(CurrentInst->getParent()->getIterator()); -- if (BlockIter == F->end()) { -- CurrentInst = nullptr; -- return; -- } -- CurrentInst = &*BlockIter->begin(); -- return; -- } -- CurrentInst = &*std::next(CurrentInst->getIterator()); -- } -- -- /// Returns true if the operand \p OpIdx of the current instruction is the -- /// callee of a call, which needs to be signed if passed as a parameter. 
-- bool needsPointerSigning(unsigned OpIdx) const { -- if (auto *CI = dyn_cast(CurrentInst)) -- return isCalleeOperand(CI, OpIdx); -- return false; -- } -- -- Function *F; -- -- /// The current instruction while iterating over all instructions. -- Instruction *CurrentInst; -- -- /// Roughly the number of parameters needed if this function would be -- /// merged with the first function of the equivalence class. -- int NumParamsNeeded; -- }; -- -- using FunctionInfos = SmallVector; -- -- /// Describes a parameter which we create to parameterize the merged function. -- struct ParamInfo { -- /// The value of the parameter for all the functions in the equivalence -- /// class. -- SmallVector Values; -- -- /// All uses of the parameter in the merged function. -- SmallVector Uses; -- -- /// The Discriminator for pointer signing. -- /// Only not null if needsPointerSigning is true. -- ConstantInt *Discriminator = nullptr; -- -- /// True if the value is a callee function, which needs to be signed if -- /// passed as a parameter. -- bool NeedsPointerSigning = false; -- -- /// Checks if this parameter can be used to describe an operand in all -- /// functions of the equivalence class. Returns true if all values match -- /// the specific instruction operands in all functions. -- bool matches(const FunctionInfos &FInfos, unsigned OpIdx, -- bool PtrAuthEnabled) const { -- unsigned NumFuncs = FInfos.size(); -- assert(Values.size() == NumFuncs); -- if (PtrAuthEnabled && -- NeedsPointerSigning != FInfos[0].needsPointerSigning(OpIdx)) { -- return false; -- } -- for (unsigned Idx = 0; Idx < NumFuncs; ++Idx) { -- const FunctionInfo &FI = FInfos[Idx]; -- Constant *C = cast(FI.CurrentInst->getOperand(OpIdx)); -- if (Values[Idx] != C) -- return false; -- } -- return true; -- } -- -- /// Computes the Discriminator for pointer signing. -- void computeDiscriminator(LLVMContext &Context) { -- assert(NeedsPointerSigning); -- assert(!Discriminator); -- -- /// Get a hash from the concatenated function names. -- /// The hash is deterministic, because the order of values depends on the -- /// order of functions in the module, which is itself deterministic. -- /// Note that the hash is not part of the ABI, because it's purly used -- /// for pointer authentication between a module-private caller-callee -- /// pair. -- std::string concatenatedCalleeNames; -- for (Constant *value : Values) { -- if (auto *GO = dyn_cast(value)) -- concatenatedCalleeNames += GO->getName(); -- } -- uint64_t rawHash = stable_hash_combine_string(concatenatedCalleeNames); -- IntegerType *discrTy = Type::getInt64Ty(Context); -- Discriminator = ConstantInt::get(discrTy, (rawHash % 0xFFFF) + 1); -- } -- }; -- -- using ParamInfos = SmallVector; -- -- Module *CurrentModule = nullptr; -- -- GlobalNumberState GlobalNumbers; -- -- /// A work queue of functions that may have been modified and should be -- /// analyzed again. -- std::vector Deferred; -- -- /// The set of all distinct functions. Use the insert() and remove() methods -- /// to modify it. The map allows efficient lookup and deferring of Functions. -- FnTreeType FnTree; -- -- ValueMap FuncEntries; -- -- // Maps a function-pointer / Discriminator pair to a corresponding global in -- // the llvm.ptrauth section. -- // This map is used as a cache to not create ptrauth globals twice. -- DenseMap, Constant *> PtrAuthGlobals; -- -- /// True if the architecture has pointer authentication enabled. -- bool PtrAuthEnabled = false; -- -- /// The key for pointer authentication. 
-- unsigned PtrAuthKey = 0; -- -- std::string MergeFuncSuffix = ".Tm"; -- -- FunctionEntry *getEntry(Function *F) const { return FuncEntries.lookup(F); } -- -- bool isInEquivalenceClass(FunctionEntry *FE) const { -- if (FE->TreeIter != FnTree.end()) { -- return true; -- } -- assert(!FE->Next); -- assert(FE->NumUnhandledCallees == 0); -- return false; -- } -- -- /// Checks the rules of order relation introduced among functions set. -- /// Returns true, if sanity check has been passed, and false if failed. -- bool doSanityCheck(std::vector &Worklist); -- -- /// Updates the NumUnhandledCallees of all user functions of the equivalence -- /// class containing \p FE by \p Delta. -- void updateUnhandledCalleeCount(FunctionEntry *FE, int Delta); -- -- bool tryMergeEquivalenceClass(FunctionEntry *FirstInClass); -- -- FunctionInfo removeFuncWithMostParams(FunctionInfos &FInfos); -- -- bool deriveParams(ParamInfos &Params, FunctionInfos &FInfos, -- unsigned maxParams); -- -- bool numOperandsDiffer(FunctionInfos &FInfos); -- -- bool constsDiffer(const FunctionInfos &FInfos, unsigned OpIdx); -- -- bool tryMapToParameter(FunctionInfos &FInfos, unsigned OpIdx, -- ParamInfos &Params, unsigned maxParams); -- -- void replaceCallWithAddedPtrAuth(CallInst *origCall, Value *newCallee, -- ConstantInt *Discriminator); -- -- void mergeWithParams(const FunctionInfos &FInfos, ParamInfos &Params); -- static void dumpMergeInfo(const FunctionInfos &FInfos, unsigned); -- -- void removeEquivalenceClassFromTree(FunctionEntry *FE); -- -- void writeThunk(Function *ToFunc, Function *Thunk, const ParamInfos &Params, -- unsigned FuncIdx); -- -- bool isPtrAuthEnabled() const { -- // TODO: fix pointer authentication -- return PtrAuthEnabled; -- } -- -- ConstantInt *getPtrAuthKey() { -- // TODO: fix pointer authentication -- return ConstantInt::get(Type::getInt32Ty(CurrentModule->getContext()), -- PtrAuthKey); -- } -- -- /// Returns the value of function \p FuncIdx, and signes it if required. -- Constant *getSignedValue(const ParamInfo &PI, unsigned FuncIdx) { -- Constant *value = PI.Values[FuncIdx]; -- if (!PI.NeedsPointerSigning) -- return value; -- -- auto lookupKey = std::make_pair(value, PI.Discriminator); -- Constant *&ptrAuthGlobal = PtrAuthGlobals[lookupKey]; -- if (!ptrAuthGlobal) { -- // TODO: fix pointer authentication -- } -- return ptrAuthGlobal; -- } -- -- /// Replace all direct calls of Old with calls of New. Will bitcast New if -- /// necessary to make types match. -- bool replaceDirectCallers(Function *Old, Function *New, -- const ParamInfos &Params, unsigned FuncIdx); --}; -- --} // end anonymous namespace -- --bool MergeFuncIgnoringConstImpl::doSanityCheck( -- std::vector &Worklist) { -- if (const unsigned Max = NumFunctionsIgnoringConstForSanityCheck) { -- unsigned TripleNumber = 0; -- bool Valid = true; -- -- dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n"; -- -- unsigned i = 0; -- for (std::vector::iterator I = Worklist.begin(), -- E = Worklist.end(); -- I != E && i < Max; ++I, ++i) { -- unsigned j = i; -- for (std::vector::iterator J = I; J != E && j < Max; -- ++J, ++j) { -- Function *F1 = cast(*I); -- Function *F2 = cast(*J); -- int Res1 = FunctionComparatorIgnoringConst(F1, F2, &GlobalNumbers) -- .compareIgnoringConsts(); -- int Res2 = FunctionComparatorIgnoringConst(F2, F1, &GlobalNumbers) -- .compareIgnoringConsts(); -- -- // If F1 <= F2, then F2 >= F1, otherwise report failure. 
-- if (Res1 != -Res2) { -- dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber -- << "\n"; -- LLVM_DEBUG(F1->dump()); -- LLVM_DEBUG(F2->dump()); -- Valid = false; -- } -- -- if (Res1 == 0) -- continue; -- -- unsigned k = j; -- for (std::vector::iterator K = J; K != E && k < Max; -- ++k, ++K, ++TripleNumber) { -- if (K == J) -- continue; -- -- Function *F3 = cast(*K); -- int Res3 = FunctionComparatorIgnoringConst(F1, F3, &GlobalNumbers) -- .compareIgnoringConsts(); -- int Res4 = FunctionComparatorIgnoringConst(F2, F3, &GlobalNumbers) -- .compareIgnoringConsts(); -- -- bool Transitive = true; -- -- if (Res1 != 0 && Res1 == Res4) { -- // F1 > F2, F2 > F3 => F1 > F3 -- Transitive = Res3 == Res1; -- } else if (Res3 != 0 && Res3 == -Res4) { -- // F1 > F3, F3 > F2 => F1 > F2 -- Transitive = Res3 == Res1; -- } else if (Res4 != 0 && -Res3 == Res4) { -- // F2 > F3, F3 > F1 => F2 > F1 -- Transitive = Res4 == -Res1; -- } -- -- if (!Transitive) { -- dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: " -- << TripleNumber << "\n"; -- dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", " -- << Res4 << "\n"; -- LLVM_DEBUG(F1->dump()); -- LLVM_DEBUG(F2->dump()); -- LLVM_DEBUG(F3->dump()); -- Valid = false; -- } -- } -- } -- } -- -- dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n"; -- return Valid; -- } -- return true; --} -- --/// Returns true if functions containing calls to \p F may be merged together. --static bool mayMergeCallsToFunction(Function &F) { -- StringRef Name = F.getName(); -- -- // Calls to dtrace probes must generate unique patchpoints. -- if (Name.startswith("__dtrace")) -- return false; -- -- return true; --} -- --/// Returns the benefit, which is approximately the size of the function. --/// Return 0, if the function should not be merged. --static unsigned getBenefit(Function *F) { -- unsigned Benefit = 0; -- -- // We don't want to merge very small functions, because the overhead of -- // adding creating thunks and/or adding parameters to the call sites -- // outweighs the benefit. -- for (BasicBlock &BB : *F) { -- for (Instruction &I : BB) { -- if (CallBase *CB = dyn_cast(&I)) { -- Function *Callee = CB->getCalledFunction(); -- if (Callee && !mayMergeCallsToFunction(*Callee)) -- return 0; -- if (!Callee || !Callee->isIntrinsic()) { -- Benefit += 5; -- continue; -- } -- } -- Benefit += 1; -- } -- } -- return Benefit; --} -- --/// Returns true if function \p F is eligible for merging. --bool isEligibleFunction(Function *F) { -- if (F->isDeclaration()) -- return false; -- -- if (F->hasFnAttribute(llvm::Attribute::NoMerge)) -- return false; -- -- if (F->hasAvailableExternallyLinkage()) { -- return false; -- } -- -- if (F->getFunctionType()->isVarArg()) { -- return false; -- } -- -- // Check against blocklist. 
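(Illustrative note, not part of the patch: a rough worked example of the getBenefit() size heuristic above, assuming the default -mergefunc-ignoringconst-threshold of 15. Calls to non-intrinsic callees count 5, every other instruction counts 1, and functions scoring below the threshold are not considered for merging.)

#include <cstdio>

int main() {
  unsigned NonIntrinsicCalls = 2; // e.g. two calls to ordinary functions: 2 * 5
  unsigned OtherInsts = 6;        // adds, loads, the return, ...: 6 * 1
  unsigned Benefit = NonIntrinsicCalls * 5 + OtherInsts; // 10 + 6 = 16
  unsigned Threshold = 15;        // default IgnoringConstMergeThreshold
  std::printf("benefit=%u eligible=%s\n", Benefit,
              Benefit >= Threshold ? "yes" : "no"); // benefit=16 eligible=yes
  return 0;
}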
-- if (!MergeBlockRegexFilters.empty()) { -- StringRef FuncName = F->getName(); -- for (const auto &tRegex : MergeBlockRegexFilters) -- if (Regex(tRegex).match(FuncName)) { -- return false; -- } -- } -- // Check against allowlist -- if (!MergeAllowRegexFilters.empty()) { -- StringRef FuncName = F->getName(); -- bool found = false; -- for (const auto &tRegex : MergeAllowRegexFilters) -- if (Regex(tRegex).match(FuncName)) { -- found = true; -- break; -- } -- if (!found) -- return false; -- } -- -- if (F->getCallingConv() == CallingConv::SwiftTail) -- return false; -- -- // if function contains callsites with musttail, if we merge -- // it, the merged function will have the musttail callsite, but -- // the number of parameters can change, thus the parameter count -- // of the callsite will mismatch with the function itself. -- if (IgnoreMusttailFunction) { -- for (const BasicBlock &BB : *F) { -- for (const Instruction &I : BB) { -- const auto *CB = dyn_cast(&I); -- if (CB && CB->isMustTailCall()) -- return false; -- } -- } -- } -- -- unsigned Benefit = getBenefit(F); -- if (Benefit < IgnoringConstMergeThreshold) { -- return false; -- } -- -- return true; --} -- --bool MergeFuncIgnoringConstImpl::runImpl(Module &M) { -- if (IgnoringConstMergeThreshold == 0) -- return false; -- -- CurrentModule = &M; -- -- // TODO: fix pointer authentication -- -- bool Changed = false; -- -- // All functions in the module, ordered by hash. Functions with a unique -- // hash value are easily eliminated. -- std::vector> HashedFuncs; -- -- for (Function &Func : M) { -- if (isEligibleFunction(&Func)) { -- HashedFuncs.push_back({StructuralHash(Func), &Func}); -- } -- } -- -- std::stable_sort(HashedFuncs.begin(), HashedFuncs.end(), -- [](const std::pair &a, -- const std::pair &b) { -- return a.first < b.first; -- }); -- -- std::vector FuncEntryStorage; -- FuncEntryStorage.reserve(HashedFuncs.size()); -- -- auto S = HashedFuncs.begin(); -- for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) { -- -- Function *F = I->second; -- FuncEntryStorage.push_back(FunctionEntry(F, FnTree.end())); -- FunctionEntry &FE = FuncEntryStorage.back(); -- FuncEntries[F] = &FE; -- -- // If the hash value matches the previous value or the next one, we must -- // consider merging it. Otherwise it is dropped and never considered again. -- if ((I != S && std::prev(I)->first == I->first) || -- (std::next(I) != IE && std::next(I)->first == I->first)) { -- Deferred.push_back(WeakTrackingVH(F)); -- } -- } -- -- do { -- std::vector Worklist; -- Deferred.swap(Worklist); -- -- LLVM_DEBUG(dbgs() << "======\nbuild tree: worklist-size=" << Worklist.size() -- << '\n'); -- LLVM_DEBUG(doSanityCheck(Worklist)); -- -- SmallVector FuncsToMerge; -- -- // Insert all candidates into the Worklist. -- for (WeakTrackingVH &I : Worklist) { -- if (!I) -- continue; -- Function *F = cast(I); -- FunctionEntry *FE = getEntry(F); -- assert(!isInEquivalenceClass(FE)); -- -- std::pair Result = FnTree.insert(FE); -- -- FE->TreeIter = Result.first; -- const EquivalenceClass &Eq = *Result.first; -- -- if (Result.second) { -- assert(Eq.First == FE); -- LLVM_DEBUG(dbgs() << " new in tree: " << F->getName() << '\n'); -- } else { -- assert(Eq.First != FE); -- LLVM_DEBUG(dbgs() << " add to existing: " << F->getName() << '\n'); -- // Add the function to the existing equivalence class. -- FE->Next = Eq.First->Next; -- Eq.First->Next = FE; -- // Schedule for merging if the function's equivalence class reaches the -- // size of 2. 
-- if (!FE->Next) -- FuncsToMerge.push_back(Eq.First); -- } -- } -- LLVM_DEBUG(dbgs() << "merge functions: tree-size=" << FnTree.size() -- << '\n'); -- -- // Figure out the leaf functions. We want to do the merging in bottom-up -- // call order. This ensures that we don't parameterize on callee function -- // names if we don't have to (because the callee may be merged). -- // Note that "leaf functions" refer to the sub-call-graph of functions which -- // are in the FnTree. -- for (FunctionEntry *ToMerge : FuncsToMerge) { -- assert(isInEquivalenceClass(ToMerge)); -- updateUnhandledCalleeCount(ToMerge, 1); -- } -- -- // Check if there are any leaf functions at all. -- bool LeafFound = false; -- for (FunctionEntry *ToMerge : FuncsToMerge) { -- if (ToMerge->NumUnhandledCallees == 0) -- LeafFound = true; -- } -- for (FunctionEntry *ToMerge : FuncsToMerge) { -- if (isInEquivalenceClass(ToMerge)) { -- // Only merge leaf functions (or all functions if all functions are in -- // a call cycle). -- if (ToMerge->NumUnhandledCallees == 0 || !LeafFound) { -- updateUnhandledCalleeCount(ToMerge, -1); -- Changed |= tryMergeEquivalenceClass(ToMerge); -- } else { -- // Non-leaf functions (i.e. functions in a call cycle) may become -- // leaf functions in the next iteration. -- removeEquivalenceClassFromTree(ToMerge); -- } -- } -- } -- } while (!Deferred.empty()); -- -- FnTree.clear(); -- GlobalNumbers.clear(); -- FuncEntries.clear(); -- PtrAuthGlobals.clear(); -- -- return Changed; --} -- --void MergeFuncIgnoringConstImpl::updateUnhandledCalleeCount(FunctionEntry *FE, -- int Delta) { -- // Iterate over all functions of FE's equivalence class. -- do { -- for (Use &U : FE->F->uses()) { -- if (auto *I = dyn_cast(U.getUser())) { -- FunctionEntry *CallerFE = getEntry(I->getFunction()); -- if (CallerFE && CallerFE->TreeIter != FnTree.end()) { -- // Accumulate the count in the first entry of the equivalence class. -- FunctionEntry *Head = CallerFE->TreeIter->First; -- Head->NumUnhandledCallees += Delta; -- } -- } -- } -- FE = FE->Next; -- } while (FE); --} -- --bool MergeFuncIgnoringConstImpl::tryMergeEquivalenceClass( -- FunctionEntry *FirstInClass) { -- // Build the FInfos vector from all functions in the equivalence class. -- FunctionInfos FInfos; -- FunctionEntry *FE = FirstInClass; -- do { -- FInfos.push_back(FunctionInfo(FE->F)); -- FE->IsMerged = true; -- FE = FE->Next; -- } while (FE); -- assert(FInfos.size() >= 2); -- -- // Merged or not: in any case we remove the equivalence class from the FnTree. -- removeEquivalenceClassFromTree(FirstInClass); -- -- // Contains functions which differ too much from the first function (i.e. -- // would need too many parameters). -- FunctionInfos Removed; -- -- bool Changed = false; -- int Try = 0; -- -- unsigned Benefit = getBenefit(FirstInClass->F); -- -- // The bigger the function, the more parameters are allowed. -- unsigned maxParams = std::max(4u, Benefit / 100); -- -- // We need multiple tries if there are some functions in FInfos which differ -- // too much from the first function in FInfos. But we limit the number of -- // tries to a small number, because this is quadratic. -- while (FInfos.size() >= 2 && Try++ < 4) { -- ParamInfos Params; -- bool Merged = deriveParams(Params, FInfos, maxParams); -- if (Merged) { -- mergeWithParams(FInfos, Params); -- Changed = true; -- } else { -- // We ran out of parameters. Remove the function from the set which -- // differs most from the first function. 
-- Removed.push_back(removeFuncWithMostParams(FInfos)); -- } -- if (Merged || FInfos.size() < 2) { -- // Try again with the functions which were removed from the original set. -- FInfos.swap(Removed); -- Removed.clear(); -- } -- } -- return Changed; --} -- --/// Remove the function from \p FInfos which needs the most parameters. Add the --/// removed function to --MergeFuncIgnoringConstImpl::FunctionInfo --MergeFuncIgnoringConstImpl::removeFuncWithMostParams(FunctionInfos &FInfos) { -- FunctionInfos::iterator MaxIter = FInfos.end(); -- for (auto Iter = FInfos.begin(), End = FInfos.end(); Iter != End; ++Iter) { -- if (MaxIter == FInfos.end() || -- Iter->NumParamsNeeded > MaxIter->NumParamsNeeded) { -- MaxIter = Iter; -- } -- } -- FunctionInfo Removed = *MaxIter; -- FInfos.erase(MaxIter); -- return Removed; --} -- --/// Finds the set of parameters which are required to merge the functions in --/// \p FInfos. --/// Returns true on success, i.e. the functions in \p FInfos can be merged with --/// the parameters returned in \p Params. --bool MergeFuncIgnoringConstImpl::deriveParams(ParamInfos &Params, -- FunctionInfos &FInfos, -- unsigned maxParams) { -- for (FunctionInfo &FI : FInfos) -- FI.init(); -- -- FunctionInfo &FirstFI = FInfos.front(); -- -- // Iterate over all instructions synchronously in all functions. -- do { -- if (isEligibleInstrunctionForConstantSharing(FirstFI.CurrentInst)) { -- -- // Here we handle a rare corner case which needs to be explained: -- // Usually the number of operands match, because otherwise the functions -- // in FInfos would not be in the same equivalence class. There is only one -- // exception to that: If the current instruction is a call to a function, -- // which was merged in the previous iteration (in -- // tryMergeEquivalenceClass) then the call could be replaced and has more -- // arguments than the original call. -- if (numOperandsDiffer(FInfos)) { -- assert(isa(FirstFI.CurrentInst) && -- "only calls are expected to differ in number of operands"); -- return false; -- } -- -- for (unsigned OpIdx = 0, NumOps = FirstFI.CurrentInst->getNumOperands(); -- OpIdx != NumOps; ++OpIdx) { -- -- if (constsDiffer(FInfos, OpIdx)) { -- // This instruction has operands which differ in at least some -- // functions. So we need to parameterize it. -- if (!tryMapToParameter(FInfos, OpIdx, Params, maxParams)) { -- // We ran out of parameters. -- return false; -- } -- } -- } -- } -- // Go to the next instruction in all functions. -- for (FunctionInfo &FI : FInfos) -- FI.nextInst(); -- } while (FirstFI.CurrentInst); -- -- return true; --} -- --/// Returns true if the number of operands of the current instruction differs. --bool MergeFuncIgnoringConstImpl::numOperandsDiffer(FunctionInfos &FInfos) { -- unsigned numOps = FInfos[0].CurrentInst->getNumOperands(); -- for (const FunctionInfo &FI : ArrayRef(FInfos).drop_front(1)) { -- if (FI.CurrentInst->getNumOperands() != numOps) -- return true; -- } -- return false; --} -- --/// Returns true if the \p OpIdx's constant operand in the current instruction --/// does differ in any of the functions in \p FInfos. 
--bool MergeFuncIgnoringConstImpl::constsDiffer(const FunctionInfos &FInfos, -- unsigned OpIdx) { -- Constant *CommonConst = nullptr; -- -- for (const FunctionInfo &FI : FInfos) { -- Value *Op = FI.CurrentInst->getOperand(OpIdx); -- if (auto *C = dyn_cast(Op)) { -- if (!CommonConst) { -- CommonConst = C; -- } else if (EnableAggressiveMergeFunc && -- isa(CommonConst) && -- isa(C)) { -- // if both are null pointer, and if they are different constants -- // due to type, still treat them as the same. -- } else if (C != CommonConst) { -- return true; -- } -- } -- } -- return false; --} -- --/// Create a new parameter for differing operands or try to reuse an existing --/// parameter. --/// Returns true if a parameter could be created or found without exceeding the --/// maximum number of parameters. --bool MergeFuncIgnoringConstImpl::tryMapToParameter(FunctionInfos &FInfos, -- unsigned OpIdx, -- ParamInfos &Params, -- unsigned maxParams) { -- ParamInfo *Matching = nullptr; -- // Try to find an existing parameter which exactly matches the differing -- // operands of the current instruction. -- for (ParamInfo &PI : Params) { -- if (PI.matches(FInfos, OpIdx, isPtrAuthEnabled())) { -- Matching = &PI; -- break; -- } -- } -- if (!Matching) { -- // We need a new parameter. -- // Check if we are within the limit. -- if (Params.size() >= maxParams) -- return false; -- -- Params.resize(Params.size() + 1); -- Matching = &Params.back(); -- // Store the constant values into the new parameter. -- Constant *FirstC = cast(FInfos[0].CurrentInst->getOperand(OpIdx)); -- for (FunctionInfo &FI : FInfos) { -- Constant *C = cast(FI.CurrentInst->getOperand(OpIdx)); -- Matching->Values.push_back(C); -- if (C != FirstC) -- FI.NumParamsNeeded += 1; -- } -- if (isPtrAuthEnabled()) -- Matching->NeedsPointerSigning = FInfos[0].needsPointerSigning(OpIdx); -- } -- /// Remember where the parameter is needed when we build our merged function. -- Matching->Uses.push_back({FInfos[0].CurrentInst, OpIdx}); -- return true; --} -- --/// Copy \p origCall with a \p newCalle and add a ptrauth bundle with \p --/// Discriminator. --void MergeFuncIgnoringConstImpl::replaceCallWithAddedPtrAuth( -- CallInst *origCall, Value *newCallee, ConstantInt *Discriminator) { -- SmallVector bundles; -- origCall->getOperandBundlesAsDefs(bundles); -- ConstantInt *key = getPtrAuthKey(); -- llvm::Value *bundleArgs[] = {key, Discriminator}; -- bundles.emplace_back("ptrauth", bundleArgs); -- -- SmallVector copiedArgs; -- for (Value *op : origCall->args()) { -- copiedArgs.push_back(op); -- } -- -- auto *newCall = -- CallInst::Create(origCall->getFunctionType(), newCallee, copiedArgs, -- bundles, origCall->getName(), origCall); -- newCall->setAttributes(origCall->getAttributes()); -- newCall->setTailCallKind(origCall->getTailCallKind()); -- newCall->setCallingConv(origCall->getCallingConv()); -- origCall->replaceAllUsesWith(newCall); -- origCall->eraseFromParent(); --} -- --void MergeFuncIgnoringConstImpl::dumpMergeInfo(const FunctionInfos &FInfos, -- unsigned paramSize) { -- std::set oHashes; -- std::vector funcLocs; -- Function *OrigFunc = nullptr; -- for (const auto &FInfo : FInfos) { -- OrigFunc = FInfo.F; -- -- llvm::IRHash origHash = StructuralHash(*OrigFunc); -- oHashes.insert(origHash); -- -- // Print debug location. 
-- std::string Result; -- raw_string_ostream DbgLocOS(Result); -- if (DISubprogram *DIS = OrigFunc->getSubprogram()) { -- DebugLoc FuncDbgLoc = -- DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS); -- FuncDbgLoc.print(DbgLocOS); -- DbgLocOS.flush(); -- } -- std::string singleLine = -- "# functionLoc " + -- std::to_string(GlobalValue::getGUID(OrigFunc->getName())) + " " + -- Result + " " + std::string(OrigFunc->getName()) + "\n"; -- funcLocs.push_back(singleLine); -- } --} -- --/// Merge all functions in \p FInfos by creating thunks which call the single --/// merged function with additional parameters. --void MergeFuncIgnoringConstImpl::mergeWithParams(const FunctionInfos &FInfos, -- ParamInfos &Params) { -- // We reuse the body of the first function for the new merged function. -- Function *FirstF = FInfos.front().F; -- -- // Build the type for the merged function. This will be the type of the -- // original function (FirstF) but with the additional parameter which are -- // needed to parameterize the merged function. -- FunctionType *OrigTy = FirstF->getFunctionType(); -- SmallVector ParamTypes(OrigTy->param_begin(), OrigTy->param_end()); -- -- for (const ParamInfo &PI : Params) { -- ParamTypes.push_back(PI.Values[0]->getType()); -- } -- -- FunctionType *funcType = -- FunctionType::get(OrigTy->getReturnType(), ParamTypes, false); -- -- // Create the new function. -- Function *NewFunction = Function::Create(funcType, FirstF->getLinkage(), -- FirstF->getName() + MergeFuncSuffix); -- if (auto *SP = FirstF->getSubprogram()) -- NewFunction->setSubprogram(SP); -- NewFunction->copyAttributesFrom(FirstF); -- // NOTE: this function is not externally available, do ensure that we reset -- // the DLL storage -- NewFunction->setDLLStorageClass(GlobalValue::DefaultStorageClass); -- if (UseLinkOnceODRLinkageMerging) -- NewFunction->setLinkage(GlobalValue::LinkOnceODRLinkage); -- else -- NewFunction->setLinkage(GlobalValue::InternalLinkage); -- if (NoInlineForMergedFunction) -- NewFunction->addFnAttr(Attribute::NoInline); -- -- // Insert the new function after the last function in the equivalence class. -- FirstF->getParent()->getFunctionList().insert( -- std::next(FInfos[1].F->getIterator()), NewFunction); -- -- LLVM_DEBUG(dbgs() << " Merge into " << NewFunction->getName() << '\n'); -- -- // Move the body of FirstF into the NewFunction. -- NewFunction->splice(NewFunction->begin(), FirstF); -- -- auto NewArgIter = NewFunction->arg_begin(); -- for (Argument &OrigArg : FirstF->args()) { -- Argument &NewArg = *NewArgIter++; -- OrigArg.replaceAllUsesWith(&NewArg); -- } -- unsigned numOrigArgs = FirstF->arg_size(); -- -- SmallPtrSet SelfReferencingFunctions; -- -- // Replace all differing operands with a parameter. -- for (unsigned paramIdx = 0; paramIdx < Params.size(); ++paramIdx) { -- const ParamInfo &PI = Params[paramIdx]; -- Argument *NewArg = NewFunction->getArg(numOrigArgs + paramIdx); -- -- if (!PI.NeedsPointerSigning) { -- for (const OpLocation &OL : PI.Uses) { -- OL.I->setOperand(OL.OpIndex, NewArg); -- } -- } -- // Collect all functions which are referenced by any parameter. -- for (Value *V : PI.Values) { -- if (auto *F = dyn_cast(V)) -- SelfReferencingFunctions.insert(F); -- } -- } -- -- // Replace all differing operands, which need pointer signing, with a -- // parameter. -- // We need to do that after all other parameters, because here we replace -- // call instructions, which must be live in case it has another constant to -- // be replaced. 
-- for (unsigned paramIdx = 0; paramIdx < Params.size(); ++paramIdx) { -- ParamInfo &PI = Params[paramIdx]; -- if (PI.NeedsPointerSigning) { -- PI.computeDiscriminator(NewFunction->getContext()); -- for (const OpLocation &OL : PI.Uses) { -- auto *origCall = cast(OL.I); -- Argument *newCallee = NewFunction->getArg(numOrigArgs + paramIdx); -- replaceCallWithAddedPtrAuth(origCall, newCallee, PI.Discriminator); -- } -- } -- } -- -- for (unsigned FIdx = 0, NumFuncs = FInfos.size(); FIdx < NumFuncs; ++FIdx) { -- Function *OrigFunc = FInfos[FIdx].F; -- // Don't try to replace all callers of functions which are used as -- // parameters because we must not delete such functions. -- if (SelfReferencingFunctions.count(OrigFunc) == 0 && -- replaceDirectCallers(OrigFunc, NewFunction, Params, FIdx)) { -- // We could replace all uses (and the function is not externally visible), -- // so we can delete the original function. -- auto Iter = FuncEntries.find(OrigFunc); -- assert(Iter != FuncEntries.end()); -- assert(!isInEquivalenceClass(&*Iter->second)); -- Iter->second->F = nullptr; -- FuncEntries.erase(Iter); -- LLVM_DEBUG(dbgs() << " Erase " << OrigFunc->getName() << '\n'); -- OrigFunc->eraseFromParent(); -- } else { -- // Otherwise we need a thunk which calls the merged function. -- writeThunk(NewFunction, OrigFunc, Params, FIdx); -- } -- ++NumFunctionsMergedIgnoringConst; -- } --} -- --/// Remove all functions of \p FE's equivalence class from FnTree. Add them to --/// Deferred so that we'll look at them in the next round. --void MergeFuncIgnoringConstImpl::removeEquivalenceClassFromTree( -- FunctionEntry *FE) { -- if (!isInEquivalenceClass(FE)) -- return; -- -- FnTreeType::iterator Iter = FE->TreeIter; -- FunctionEntry *Unlink = Iter->First; -- Unlink->NumUnhandledCallees = 0; -- while (Unlink) { -- LLVM_DEBUG(dbgs() << " remove from tree: " << Unlink->F->getName() -- << '\n'); -- if (!Unlink->IsMerged) -- Deferred.emplace_back(Unlink->F); -- Unlink->TreeIter = FnTree.end(); -- assert(Unlink->NumUnhandledCallees == 0); -- FunctionEntry *NextEntry = Unlink->Next; -- Unlink->Next = nullptr; -- Unlink = NextEntry; -- } -- FnTree.erase(Iter); --} -- --// Helper for writeThunk, --// Selects proper bitcast operation, --// but a bit simpler then CastInst::getCastOpcode. 
--Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { -- Type *SrcTy = V->getType(); -- if (SrcTy->isStructTy()) { -- assert(DestTy->isStructTy()); -- assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements()); -- Value *Result = UndefValue::get(DestTy); -- for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) { -- Value *Element = -- createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)), -- DestTy->getStructElementType(I)); -- -- Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I)); -- } -- return Result; -- } -- assert(!DestTy->isStructTy()); -- if (CastArrayType) { -- if (auto *SrcAT = dyn_cast(SrcTy)) { -- auto *DestAT = dyn_cast(DestTy); -- assert(DestAT); -- assert(SrcAT->getNumElements() == DestAT->getNumElements()); -- Value *Result = UndefValue::get(DestTy); -- for (unsigned int I = 0, E = SrcAT->getNumElements(); I < E; ++I) { -- Value *Element = -- createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)), -- DestAT->getElementType()); -- -- Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I)); -- } -- return Result; -- } -- assert(!DestTy->isArrayTy()); -- } -- if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) -- return Builder.CreateIntToPtr(V, DestTy); -- else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) -- return Builder.CreatePtrToInt(V, DestTy); -- else -- return Builder.CreateBitCast(V, DestTy); --} -- --/// Replace \p Thunk with a simple tail call to \p ToFunc. Also add parameters --/// to the call to \p ToFunc, which are defined by the FuncIdx's value in --/// \p Params. --void MergeFuncIgnoringConstImpl::writeThunk(Function *ToFunc, Function *Thunk, -- const ParamInfos &Params, -- unsigned FuncIdx) { -- // Delete the existing content of Thunk. -- Thunk->dropAllReferences(); -- -- BasicBlock *BB = BasicBlock::Create(Thunk->getContext(), "", Thunk); -- IRBuilder<> Builder(BB); -- -- SmallVector Args; -- unsigned ParamIdx = 0; -- FunctionType *ToFuncTy = ToFunc->getFunctionType(); -- -- // Add arguments which are passed through Thunk. -- for (Argument &AI : Thunk->args()) { -- Args.push_back(createCast(Builder, &AI, ToFuncTy->getParamType(ParamIdx))); -- ++ParamIdx; -- } -- // Add new arguments defined by Params. -- for (const ParamInfo &PI : Params) { -- assert(ParamIdx < ToFuncTy->getNumParams()); -- Constant *param = getSignedValue(PI, FuncIdx); -- Args.push_back( -- createCast(Builder, param, ToFuncTy->getParamType(ParamIdx))); -- ++ParamIdx; -- } -- -- CallInst *CI = Builder.CreateCall(ToFunc, Args); -- bool isSwiftTailCall = ToFunc->getCallingConv() == CallingConv::SwiftTail && -- Thunk->getCallingConv() == CallingConv::SwiftTail; -- CI->setTailCallKind(isSwiftTailCall ? 
llvm::CallInst::TCK_MustTail -- : llvm::CallInst::TCK_Tail); -- CI->setCallingConv(ToFunc->getCallingConv()); -- CI->setAttributes(ToFunc->getAttributes()); -- if (Thunk->getReturnType()->isVoidTy()) { -- Builder.CreateRetVoid(); -- } else { -- Builder.CreateRet(createCast(Builder, CI, Thunk->getReturnType())); -- } -- -- LLVM_DEBUG(dbgs() << " writeThunk: " << Thunk->getName() << '\n'); -- ++NumThunksWrittenIgnoringConst; --} -- --static llvm::AttributeList --fixUpTypesInByValAndStructRetAttributes(llvm::FunctionType *fnType, -- llvm::AttributeList attrList) { -- auto &context = fnType->getContext(); -- if (!context.supportsTypedPointers()) -- return attrList; -- -- for (unsigned i = 0; i < fnType->getNumParams(); ++i) { -- auto paramTy = fnType->getParamType(i); -- auto attrListIndex = llvm::AttributeList::FirstArgIndex + i; -- if (attrList.hasParamAttr(i, llvm::Attribute::StructRet) && -- paramTy->getNonOpaquePointerElementType() != -- attrList.getParamStructRetType(i)) -- attrList = attrList.replaceAttributeTypeAtIndex( -- context, attrListIndex, llvm::Attribute::StructRet, -- paramTy->getNonOpaquePointerElementType()); -- if (attrList.hasParamAttr(i, llvm::Attribute::ByVal) && -- paramTy->getNonOpaquePointerElementType() != -- attrList.getParamByValType(i)) -- attrList = attrList.replaceAttributeTypeAtIndex( -- context, attrListIndex, llvm::Attribute::ByVal, -- paramTy->getNonOpaquePointerElementType()); -- } -- return attrList; --} -- --/// Replace direct callers of Old with New. Also add parameters to the call to --/// \p New, which are defined by the FuncIdx's value in \p Params. --bool MergeFuncIgnoringConstImpl::replaceDirectCallers(Function *Old, -- Function *New, -- const ParamInfos &Params, -- unsigned FuncIdx) { -- bool AllReplaced = true; -- -- SmallVector Callers; -- -- for (Use &U : Old->uses()) { -- auto *I = dyn_cast(U.getUser()); -- if (!I) { -- AllReplaced = false; -- continue; -- } -- FunctionEntry *FE = getEntry(I->getFunction()); -- if (FE) -- removeEquivalenceClassFromTree(FE); -- -- auto *CI = dyn_cast(I); -- if (!CI || CI->getCalledOperand() != Old) { -- AllReplaced = false; -- continue; -- } -- Callers.push_back(CI); -- } -- if (!AllReplaced) -- return false; -- -- // When AlwaysCallThunk is true, return false so a thunk will be emitted, also -- // do not replace callsites. -- if (AlwaysCallThunk) -- return false; -- -- for (CallInst *CI : Callers) { -- auto &Context = New->getContext(); -- auto NewPAL = New->getAttributes(); -- -- SmallVector OldParamTypes; -- SmallVector NewArgs; -- SmallVector NewArgAttrs; -- IRBuilder<> Builder(CI); -- -- FunctionType *NewFuncTy = New->getFunctionType(); -- (void)NewFuncTy; -- unsigned ParamIdx = 0; -- -- // Add the existing parameters. -- for (Value *OldArg : CI->args()) { -- NewArgAttrs.push_back(NewPAL.getParamAttrs(ParamIdx)); -- NewArgs.push_back(OldArg); -- OldParamTypes.push_back(OldArg->getType()); -- ++ParamIdx; -- } -- // Add the new parameters. 
-- for (const ParamInfo &PI : Params) { -- assert(ParamIdx < NewFuncTy->getNumParams()); -- Constant *ArgValue = getSignedValue(PI, FuncIdx); -- assert(ArgValue != Old && "should not try to replace all callers of self " -- "referencing functions"); -- NewArgs.push_back(ArgValue); -- OldParamTypes.push_back(ArgValue->getType()); -- ++ParamIdx; -- } -- -- auto *FType = FunctionType::get(Old->getFunctionType()->getReturnType(), -- OldParamTypes, false); -- auto *FPtrType = PointerType::get( -- FType, cast(New->getType())->getAddressSpace()); -- -- Value *Callee = ConstantExpr::getBitCast(New, FPtrType); -- CallInst *NewCI; -- if (objcarc::hasAttachedCallOpBundle(CI)) { -- Value *BundleArgs[] = {*objcarc::getAttachedARCFunction(CI)}; -- OperandBundleDef OB("clang.arc.attachedcall", BundleArgs); -- NewCI = Builder.CreateCall(FType, Callee, NewArgs, {OB}); -- } else { -- NewCI = Builder.CreateCall(FType, Callee, NewArgs); -- } -- NewCI->setCallingConv(CI->getCallingConv()); -- // Don't transfer attributes from the function to the callee. Function -- // attributes typically aren't relevant to the calling convention or ABI. -- auto newAttrList = AttributeList::get(Context, /*FnAttrs=*/AttributeSet(), -- NewPAL.getRetAttrs(), NewArgAttrs); -- newAttrList = fixUpTypesInByValAndStructRetAttributes(FType, newAttrList); -- NewCI->setAttributes(newAttrList); -- if (IgnoreMusttailFunction && CI->isMustTailCall()) { -- // replace a callsite with musttail. -- llvm::errs() << "callsite has musttail in newF " << New->getName() -- << "\n"; -- } -- NewCI->copyMetadata(*CI); -- CI->replaceAllUsesWith(NewCI); -- CI->eraseFromParent(); -- } -- assert(Old->use_empty() && "should have replaced all uses of old function"); -- return Old->hasLocalLinkage(); --} -- --PreservedAnalyses MergeFuncIgnoringConstPass::run(Module &M, -- ModuleAnalysisManager &MAM) { -- if (MergeFuncIgnoringConstImpl(PtrAuthEnabled, PtrAuthKey, MergeFuncSuffix) -- .runImpl(M)) -- return PreservedAnalyses::none(); -- return PreservedAnalyses::all(); --} -diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt ---- a/llvm/lib/Transforms/Utils/CMakeLists.txt -+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt -@@ -27,7 +27,6 @@ - FixIrreducible.cpp - FlattenCFG.cpp - FunctionComparator.cpp -- FunctionComparatorIgnoringConst.cpp - FunctionImportUtils.cpp - GlobalStatus.cpp - GuardUtils.cpp -diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp b/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp ---- a/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp -+++ b/llvm/lib/Transforms/Utils/FunctionComparatorIgnoringConst.cpp -@@ -1,107 +0,0 @@ --//===--- FunctionComparatorIgnoringConst.cpp - Function Comparator --------===// --// --// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. --// See https://llvm.org/LICENSE.txt for license information. 
--// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception --// --//===----------------------------------------------------------------------===// --// --//===----------------------------------------------------------------------===// -- --#include "llvm/Transforms/Utils/FunctionComparatorIgnoringConst.h" --#include "llvm/IR/Instructions.h" --#include "llvm/Transforms/Utils/MergeFunctionsIgnoringConst.h" -- --using namespace llvm; -- --int FunctionComparatorIgnoringConst::cmpOperandsIgnoringConsts( -- const Instruction *L, const Instruction *R, unsigned opIdx) { -- Value *OpL = L->getOperand(opIdx); -- Value *OpR = R->getOperand(opIdx); -- -- int Res = cmpValues(OpL, OpR); -- if (Res == 0) -- return Res; -- -- if (!isa(OpL) || !isa(OpR)) -- return Res; -- -- if (!isEligibleOperandForConstantSharing(L, opIdx) || -- !isEligibleOperandForConstantSharing(R, opIdx)) -- return Res; -- -- if (cmpTypes(OpL->getType(), OpR->getType())) -- return Res; -- -- return 0; --} -- --// Test whether two basic blocks have equivalent behavior. --int FunctionComparatorIgnoringConst::cmpBasicBlocksIgnoringConsts( -- const BasicBlock *BBL, const BasicBlock *BBR, -- const std::set> *InstOpndIndex) { -- BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end(); -- BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end(); -- -- do { -- bool needToCmpOperands = true; -- if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands)) -- return Res; -- if (needToCmpOperands) { -- assert(InstL->getNumOperands() == InstR->getNumOperands()); -- -- for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) { -- // When a set for (instruction, operand) index pairs is given, we only -- // ignore constants located at such indices. Otherwise, we precisely -- // compare the operands. -- if (InstOpndIndex && !InstOpndIndex->count(std::make_pair(Index, i))) { -- Value *OpL = InstL->getOperand(i); -- Value *OpR = InstR->getOperand(i); -- if (int Res = cmpValues(OpL, OpR)) -- return Res; -- } -- if (int Res = cmpOperandsIgnoringConsts(&*InstL, &*InstR, i)) -- return Res; -- // cmpValues should ensure this is true. -- assert(cmpTypes(InstL->getOperand(i)->getType(), -- InstR->getOperand(i)->getType()) == 0); -- } -- } -- ++Index; -- ++InstL, ++InstR; -- } while (InstL != InstLE && InstR != InstRE); -- -- if (InstL != InstLE && InstR == InstRE) -- return 1; -- if (InstL == InstLE && InstR != InstRE) -- return -1; -- return 0; --} -- --// Test whether the two functions have equivalent behavior. 
--int FunctionComparatorIgnoringConst::compareIgnoringConsts( -- const std::set> *InstOpndIndex) { -- beginCompare(); -- Index = 0; -- -- if (int Res = compareSignature()) -- return Res; -- -- Function::const_iterator LIter = FnL->begin(), LEnd = FnL->end(); -- Function::const_iterator RIter = FnR->begin(), REnd = FnR->end(); -- -- do { -- const BasicBlock *BBL = &*LIter; -- const BasicBlock *BBR = &*RIter; -- -- if (int Res = cmpValues(BBL, BBR)) -- return Res; -- -- if (int Res = cmpBasicBlocksIgnoringConsts(BBL, BBR, InstOpndIndex)) -- return Res; -- -- ++LIter, ++RIter; -- } while (LIter != LEnd && RIter != REnd); -- -- return 0; --} -diff -ruN --strip-trailing-cr a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll ---- a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll -+++ b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_func.ll -@@ -1,532 +0,0 @@ --; RUN: opt -S -mergefunc-ignoringconst-threshold=4 -passes=mergefunc-ignoring-const %s | FileCheck %s -- --@g1 = external global i32 --@g2 = external global i32 --@g3 = external global i32 --@g4 = external global i32 --@g5 = external global i32 -- --; Test the most trivial example. -- --; CHECK-LABEL: define i32 @simple_func1(i32 %x, i32 %y) --; CHECK: %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g1) --; CHECK: ret i32 %1 --define i32 @simple_func1(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %l = load i32, i32* @g1, align 4 -- %sum3 = add i32 %sum2, %y -- ret i32 %sum3 --} -- --; CHECK-LABEL: define i32 @simple_func2(i32 %x, i32 %y) --; CHECK: %1 = tail call i32 @simple_func1.Tm(i32 %x, i32 %y, ptr @g2) --; CHECK: ret i32 %1 --define i32 @simple_func2(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %l = load i32, i32* @g2, align 4 -- %sum3 = add i32 %sum2, %y -- ret i32 %sum3 --} -- --; CHECK-LABEL: define internal i32 @simple_func1.Tm(i32 %0, i32 %1, ptr %2) --; CHECK: %l = load i32, ptr %2 --; CHECK: ret -- -- --; Merge 3 functions with 3 types of differing instructions: load, store and call. 
-- --; CHECK-LABEL: define i32 @func1_of_3(i32 %x) --; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g1, ptr @g1, ptr @callee1) --; CHECK: ret i32 %1 --define i32 @func1_of_3(i32 %x) { -- %l1 = load i32, i32* @g1, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g1, align 4 -- %sum2 = add i32 %sum, %l2 -- store i32 %sum2, i32 *@g1, align 4 -- call void @callee1(i32 %sum2) -- %sum3 = add i32 %sum2, %l2 -- ret i32 %sum3 --} -- --; CHECK-LABEL: define i32 @func2_of_3(i32 %x) --; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g2, ptr @g2, ptr @callee2) --; CHECK: ret i32 %1 --define i32 @func2_of_3(i32 %x) { -- %l1 = load i32, i32* @g2, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g2, align 4 -- %sum2 = add i32 %sum, %l2 -- store i32 %sum2, i32 *@g2, align 4 -- call void @callee2(i32 %sum2) -- %sum3 = add i32 %sum2, %l2 -- ret i32 %sum3 --} -- --; CHECK-LABEL: define i32 @func3_of_3(i32 %x) --; CHECK: %1 = tail call i32 @func1_of_3.Tm(i32 %x, ptr @g3, ptr @g1, ptr @callee3) --; CHECK: ret i32 %1 --define i32 @func3_of_3(i32 %x) { -- %l1 = load i32, i32* @g3, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g1, align 4 -- %sum2 = add i32 %sum, %l2 -- store i32 %sum2, i32 *@g3, align 4 -- call void @callee3(i32 %sum2) -- %sum3 = add i32 %sum2, %l2 -- ret i32 %sum3 --} -- --; CHECK-LABEL: define internal i32 @func1_of_3.Tm(i32 %0, ptr %1, ptr %2, ptr %3) --; CHECK: %l1 = load i32, ptr %1 --; CHECK: %l2 = load i32, ptr %2 --; CHECK: store i32 %sum2, ptr %1 --; CHECK: call void %3(i32 %sum2) --; CHECK: ret -- --declare void @callee1(i32 %x) --declare void @callee2(i32 %x) --declare void @callee3(i32 %x) -- --; Preserve attributes -- --; CHECK-LABEL: define void @sret_func1(ptr sret(i32) %p, i32 %x, i32 %y) --; CHECK: tail call void @sret_func1.Tm(ptr sret(i32) %p, i32 %x, i32 %y, ptr @g1) --; CHECK: ret void --define void @sret_func1(i32* sret(i32) %p, i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %l = load i32, i32* @g1, align 4 -- %sum2 = add i32 %sum, %l -- store i32 %sum2, i32* %p -- ret void --} -- --; CHECK-LABEL: define void @sret_func2(ptr sret(i32) %p, i32 %x, i32 %y) --; CHECK: tail call void @sret_func1.Tm(ptr sret(i32) %p, i32 %x, i32 %y, ptr @g2) --; CHECK: ret void --define void @sret_func2(i32* sret(i32) %p, i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %l = load i32, i32* @g2, align 4 -- %sum2 = add i32 %sum, %l -- store i32 %sum2, i32* %p -- ret void --} -- --; CHECK-LABEL: define internal void @sret_func1.Tm(ptr sret(i32) %0, i32 %1, i32 %2, ptr %3) --; CHECK: %l = load i32, ptr %3, align 4 --; CHECK: store i32 %sum2, ptr %0 --; CHECK: ret -- -- --; Don't merge all functions, because we would generate too many parameters. --; Instead merge those functions which match best. 
-- --; CHECK-LABEL: define i32 @func1_merged_with3(i32 %x) --; CHECK: %1 = tail call i32 @func1_merged_with3.Tm(i32 %x, ptr @g1) --; CHECK: ret i32 %1 --define i32 @func1_merged_with3(i32 %x) { -- %l1 = load i32, i32* @g1, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g2, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g3, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g4, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g5, align 4 -- %sum5 = add i32 %sum4, %l2 -- ret i32 %sum5 --} -- --; CHECK-LABEL: define i32 @func2_merged_with4(i32 %x) --; CHECK: %1 = tail call i32 @func2_merged_with4.Tm(i32 %x, ptr @g2) --; CHECK: ret i32 %1 --define i32 @func2_merged_with4(i32 %x) { -- %l1 = load i32, i32* @g2, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g3, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g4, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g5, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g1, align 4 -- %sum5 = add i32 %sum4, %l2 -- ret i32 %sum5 --} -- --; CHECK-LABEL: define i32 @func3_merged_with1(i32 %x) --; CHECK: %1 = tail call i32 @func1_merged_with3.Tm(i32 %x, ptr @g2) --; CHECK: ret i32 %1 --define i32 @func3_merged_with1(i32 %x) { -- %l1 = load i32, i32* @g2, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g2, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g3, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g4, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g5, align 4 -- %sum5 = add i32 %sum4, %l2 -- ret i32 %sum5 --} -- --; CHECK-LABEL: define internal i32 @func1_merged_with3.Tm(i32 %0, ptr %1) --; CHECK: load i32, ptr %1, align 4 --; CHECK: load i32, ptr @g2, align 4 --; CHECK: load i32, ptr @g3, align 4 --; CHECK: load i32, ptr @g4, align 4 --; CHECK: load i32, ptr @g5, align 4 --; CHECK: ret i32 -- --; CHECK-LABEL: define i32 @func4_merged_with2(i32 %x) { --; CHECK: %1 = tail call i32 @func2_merged_with4.Tm(i32 %x, ptr @g1) --; CHECK: ret i32 %1 --define i32 @func4_merged_with2(i32 %x) { -- %l1 = load i32, i32* @g1, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g3, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g4, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g5, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g1, align 4 -- %sum5 = add i32 %sum4, %l2 -- ret i32 %sum5 --} -- -- --; The same example as above, but we cannot merge func2 with func4, because --; func4 calls func1 (which is merged with func2 in the first iteration). 
-- --declare i32 @get_int(i32 %x) -- --; CHECK-LABEL: define i32 @Function1_merged_with_3(i32 %x) --; CHECK: %1 = tail call i32 @Function1_merged_with_3.Tm(i32 %x, ptr @g1) --; CHECK: ret i32 %1 --define i32 @Function1_merged_with_3(i32 %x) { -- %l1 = load i32, i32* @g1, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g2, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g3, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g4, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g5, align 4 -- %sum5 = add i32 %sum4, %l2 -- %c = call fastcc i32 @get_int(i32 %sum5) -- ret i32 %c --} -- --; CHECK-LABEL: define i32 @Function2_not_merged(i32 %x) --; CHECK: load --; CHECK: load --; CHECK: load --; CHECK: load --; CHECK: %c = call fastcc i32 @get_int --; CHECK: ret i32 %c --define i32 @Function2_not_merged(i32 %x) { -- %l1 = load i32, i32* @g2, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g3, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g4, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g5, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g1, align 4 -- %sum5 = add i32 %sum4, %l2 -- %c = call fastcc i32 @get_int(i32 %sum5) -- ret i32 %c --} -- --; CHECK-LABEL: define i32 @Function3_merged_with_1(i32 %x) --; CHECK: %1 = tail call i32 @Function1_merged_with_3.Tm(i32 %x, ptr @g2) --; CHECK: ret i32 %1 --define i32 @Function3_merged_with_1(i32 %x) { -- %l1 = load i32, i32* @g2, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g2, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g3, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g4, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g5, align 4 -- %sum5 = add i32 %sum4, %l2 -- %c = call fastcc i32 @get_int(i32 %sum5) -- ret i32 %c --} -- --; CHECK-LABEL: define internal i32 @Function1_merged_with_3.Tm(i32 %0, ptr %1) --; CHECK: load --; CHECK: load --; CHECK: load --; CHECK: load --; CHECK: %c = call fastcc i32 @get_int --; CHECK: ret i32 %c -- --; CHECK-LABEL: define i32 @Function4_not_merged(i32 %x) { --; CHECK: load --; CHECK: load --; CHECK: load --; CHECK: load --; CHECK: %1 = call fastcc i32 @Function1_merged_with_3.Tm(i32 %sum5, ptr @g1) --; CHECK: ret i32 %1 --define i32 @Function4_not_merged(i32 %x) { -- %l1 = load i32, i32* @g1, align 4 -- %sum = add i32 %x, %l1 -- %l2 = load i32, i32* @g3, align 4 -- %sum2 = add i32 %sum, %l2 -- %l3 = load i32, i32* @g4, align 4 -- %sum3 = add i32 %sum2, %l2 -- %l4 = load i32, i32* @g5, align 4 -- %sum4 = add i32 %sum3, %l2 -- %l5 = load i32, i32* @g1, align 4 -- %sum5 = add i32 %sum4, %l2 -- %c = call fastcc i32 @Function1_merged_with_3(i32 %sum5) -- ret i32 %c --} -- -- --; Test a call chain: caller -> callee1 -> callee2. --; Functions should be merged in bottom-up order: callee2, callee1, caller. --; Also check that the calling convention is preserved. 
-- --; CHECK-LABEL: define fastcc i32 @callee1_a(i32 %x, i32 %y) --; CHECK: %1 = tail call fastcc i32 @callee1_a.Tm(i32 %x, i32 %y, ptr @g1) --; CHECK: ret i32 %1 --define fastcc i32 @callee1_a(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %c = call i32 @callee2_a(i32 %sum2, i32 %y) -- %sum3 = add i32 %sum2, %c -- ret i32 %sum3 --} -- --; CHECK-LABEL: define fastcc i32 @callee1_b(i32 %x, i32 %y) --; CHECK: %1 = tail call fastcc i32 @callee1_a.Tm(i32 %x, i32 %y, ptr @g2) --; CHECK: ret i32 %1 --define fastcc i32 @callee1_b(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %c = call i32 @callee2_b(i32 %sum2, i32 %y) -- %sum3 = add i32 %sum2, %c -- ret i32 %sum3 --} -- --; CHECK-LABEL: define internal fastcc i32 @callee1_a.Tm(i32 %0, i32 %1, ptr %2) --; CHECK: call i32 @callee2_a.Tm(i32 %sum2, i32 %1, ptr %2) --; CHECK: ret -- --; CHECK-NOT: @callee2_a( --define internal i32 @callee2_a(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = sub i32 %sum, %y -- %l = load i32, i32* @g1, align 4 -- %sum3 = add i32 %sum2, %y -- ret i32 %sum3 --} -- --; CHECK-NOT: @callee2_b( --define internal i32 @callee2_b(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = sub i32 %sum, %y -- %l = load i32, i32* @g2, align 4 -- %sum3 = add i32 %sum2, %y -- ret i32 %sum3 --} -- --; CHECK-LABEL: define i32 @caller_a(i32 %x, i32 %y) --; CHECK: %1 = tail call i32 @caller_a.Tm(i32 %x, i32 %y, ptr @g1) --; CHECK: ret i32 %1 --define i32 @caller_a(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %c = call fastcc i32 @callee1_a(i32 %sum2, i32 %y) -- %sum3 = add i32 %sum2, %c -- ret i32 %sum3 --} -- --; CHECK-LABEL: define i32 @caller_b(i32 %x, i32 %y) --; CHECK: %1 = tail call i32 @caller_a.Tm(i32 %x, i32 %y, ptr @g2) --; CHECK: ret i32 %1 --define i32 @caller_b(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %c = call fastcc i32 @callee1_b(i32 %sum2, i32 %y) -- %sum3 = add i32 %sum2, %c -- ret i32 %sum3 --} -- --; CHECK-LABEL: define internal i32 @caller_a.Tm(i32 %0, i32 %1, ptr %2) --; CHECK: call fastcc i32 @callee1_a.Tm(i32 %sum2, i32 %1, ptr %2) --; CHECK: ret -- -- --; Ensure that we do not merge functions that are identical with the --; exception of the order of the incoming blocks to a phi. 
-- --; CHECK-LABEL: define linkonce_odr hidden i1 @first(i2 %0) --define linkonce_odr hidden i1 @first(i2) { --entry: --; CHECK: switch i2 -- switch i2 %0, label %default [ -- i2 0, label %L1 -- i2 1, label %L2 -- i2 -2, label %L3 -- ] --default: -- unreachable --L1: -- br label %done --L2: -- br label %done --L3: -- br label %done --done: -- %result = phi i1 [ true, %L1 ], [ false, %L2 ], [ false, %L3 ] --; CHECK: ret i1 -- ret i1 %result --} -- --; CHECK-LABEL: define linkonce_odr hidden i1 @second(i2 %0) --define linkonce_odr hidden i1 @second(i2) { --entry: --; CHECK: switch i2 -- switch i2 %0, label %default [ -- i2 0, label %L1 -- i2 1, label %L2 -- i2 -2, label %L3 -- ] --default: -- unreachable --L1: -- br label %done --L2: -- br label %done --L3: -- br label %done --done: -- %result = phi i1 [ true, %L3 ], [ false, %L2 ], [ false, %L1 ] --; CHECK: ret i1 -- ret i1 %result --} -- --; Check self recursive functions -- --; CHECK-LABEL: define internal void @recursive1(i32 %x, i32 %y) --; CHECK: tail call void @recursive1.Tm(i32 %x, i32 %y, ptr @g1, ptr @recursive1) --; CHECK: ret void --define internal void @recursive1(i32 %x, i32 %y) { -- br i1 undef, label %bb1, label %bb2 -- --bb1: -- %l = load i32, i32* @g1, align 4 -- call void @recursive1(i32 %x, i32 %y) -- br label %bb2 -- --bb2: -- ret void --} -- --; CHECK-LABEL: define internal void @recursive2(i32 %x, i32 %y) --; CHECK: tail call void @recursive1.Tm(i32 %x, i32 %y, ptr @g2, ptr @recursive2) --; CHECK: ret void --define internal void @recursive2(i32 %x, i32 %y) { -- br i1 undef, label %bb1, label %bb2 -- --bb1: -- %l = load i32, i32* @g2, align 4 -- call void @recursive2(i32 %x, i32 %y) -- br label %bb2 -- --bb2: -- ret void --} --; CHECK-LABEL: define internal void @recursive1.Tm(i32 %0, i32 %1, ptr %2, ptr %3) --; CHECK: load i32, ptr %2 --; CHECK: call void %3(i32 %0, i32 %1) --; CHECK: ret void -- -- --; CHECK-LABEL: define internal void @another_recursive_func(i32 %x) --; CHECK: tail call void @another_recursive_func.Tm(i32 %x, ptr @g1, ptr @another_recursive_func) --; CHECK: ret void --define internal void @another_recursive_func(i32 %x) { -- br i1 undef, label %bb1, label %bb2 -- --bb1: -- store i32 %x, i32 *@g1, align 4 -- call void @another_recursive_func(i32 %x) -- br label %bb2 -- --bb2: -- ret void --} --; CHECK-NOT: @not_really_recursive( -- --; CHECK-LABEL: define internal void @another_recursive_func.Tm(i32 %0, ptr %1, ptr %2) --; CHECK: store i32 %0, ptr %1 --; CHECK: call void %2(i32 %0) --; CHECK: ret void --define internal void @not_really_recursive(i32 %x) { -- br i1 undef, label %bb1, label %bb2 -- --bb1: -- store i32 %x, i32 *@g2, align 4 -- call void @callee1(i32 %x) -- br label %bb2 -- --bb2: -- ret void --} --; CHECK-NOT: @not_really_recursive( -- --; CHECK-LABEL: define void @call_recursive_funcs(i32 %x) --; CHECK: call void @recursive1(i32 %x, i32 %x) --; CHECK: call void @recursive2(i32 %x, i32 %x) --; CHECK: call void @another_recursive_func(i32 %x) --; CHECK: call void @another_recursive_func.Tm(i32 %x, ptr @g2, ptr @callee1) --; CHECK: ret void --define void @call_recursive_funcs(i32 %x) { -- call void @recursive1(i32 %x, i32 %x) -- call void @recursive2(i32 %x, i32 %x) -- call void @another_recursive_func(i32 %x) -- call void @not_really_recursive(i32 %x) -- ret void --} -- --; Ensure that we do not merge functions which make use of distinct dtrace --; probes. Each call to a dtrace probe must resolve to a unique patchpoint. 
-- --declare void @"__dtrace_probe$Apple$Probe1$v1$696e74"(i32) local_unnamed_addr -- --; CHECK-LABEL: define i32 @use_dtrace_probe1 --; CHECK: call void @"__dtrace_probe$Apple$Probe1$v1$696e74" --define i32 @use_dtrace_probe1(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %l = load i32, i32* @g1, align 4 -- %sum3 = add i32 %sum2, %y -- tail call void @"__dtrace_probe$Apple$Probe1$v1$696e74"(i32 undef) -- ret i32 %sum3 --} -- --declare void @"__dtrace_probe$Apple$Probe2$v1$696e74"(i32) local_unnamed_addr -- --; CHECK-LABEL: define i32 @use_dtrace_probe2 --; CHECK: call void @"__dtrace_probe$Apple$Probe2$v1$696e74" --define i32 @use_dtrace_probe2(i32 %x, i32 %y) { -- %sum = add i32 %x, %y -- %sum2 = add i32 %sum, %y -- %l = load i32, i32* @g2, align 4 -- %sum3 = add i32 %sum2, %y -- tail call void @"__dtrace_probe$Apple$Probe2$v1$696e74"(i32 undef) -- ret i32 %sum3 --} -diff -ruN --strip-trailing-cr a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll ---- a/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll -+++ b/llvm/test/Transforms/MergeFuncIgnoringConst/merge_with_exception.ll -@@ -1,190 +0,0 @@ --; RUN: opt -S -enable-aggressive-mergefunc-ignoringconst -passes=mergefunc-ignoring-const %s -o - | FileCheck %s -- --%4 = type opaque --%10 = type opaque --%"struct.SearchSpec::State" = type { %4* } --%"struct.PointerList" = type { i8*, i8*, i8*, i8*, i8* } --%"struct.DynamicCallback" = type { %10* } -- --; CHECK: define ptr @invoke_foo(ptr nocapture readonly %.block_descriptor, ptr %stateWrapper) --; CHECK: %1 = {{.*}}call ptr @invoke_foo.Tm --; CHECK: define ptr @invoke_bar(ptr nocapture readonly %.block_descriptor, ptr %stateWrapper) { --; CHECK: %1 = {{.*}}call ptr @invoke_foo.Tm --; CHECK: define {{.*}}.Tm(ptr nocapture readonly %0, ptr %1, ptr %2, ptr %3) -- --; Function Attrs: minsize optsize ssp uwtable --define i8* @invoke_foo(i8* nocapture readonly %.block_descriptor, i8* %stateWrapper) #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { --entry: -- %state = alloca %"struct.SearchSpec::State", align 8 -- %agg.tmp = alloca %"struct.PointerList", align 8 -- %0 = tail call i8* @llvm.objc.retain(i8* %stateWrapper) #2 -- %1 = bitcast %"struct.SearchSpec::State"* %state to i8* -- call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #2 -- %2 = getelementptr inbounds i8, i8* %stateWrapper, i64 16 -- %3 = bitcast i8* %2 to %"struct.SearchSpec::State"* (i8*)** -- %4 = load %"struct.SearchSpec::State"* (i8*)*, %"struct.SearchSpec::State"* (i8*)** %3, align 8 -- %call.i4 = invoke nonnull align 8 dereferenceable(8) %"struct.SearchSpec::State"* %4(i8* nonnull %stateWrapper) #31 -- to label %invoke.cont unwind label %lpad -- --invoke.cont: ; preds = %entry -- %initialText.i.i = getelementptr inbounds %"struct.SearchSpec::State", %"struct.SearchSpec::State"* %state, i64 0, i32 0 -- %initialText2.i.i = getelementptr inbounds %"struct.SearchSpec::State", %"struct.SearchSpec::State"* %call.i4, i64 0, i32 0 -- %5 = load %4*, %4** %initialText2.i.i, align 8 -- %6 = bitcast %4* %5 to i8* -- %7 = tail call i8* @llvm.objc.retain(i8* %6) #2 -- store %4* %5, %4** %initialText.i.i, align 8 -- %block.capture.addr = getelementptr inbounds i8, i8* %.block_descriptor, i64 32 -- %8 = bitcast i8* %block.capture.addr to i8** -- %9 = load i8*, i8** %8, align 8 -- invoke void @callee2(%"struct.PointerList"* nonnull sret(%"struct.PointerList") align 8 %agg.tmp, i8* 
%9, i1 zeroext false) #31 -- to label %invoke.cont2 unwind label %lpad1 -- --invoke.cont2: ; preds = %invoke.cont -- %block.capture.addr3 = getelementptr inbounds i8, i8* %.block_descriptor, i64 40 -- %10 = bitcast i8* %block.capture.addr3 to %4** -- %agg.tmp6.sroa.3.0..sroa_idx12 = getelementptr inbounds %"struct.PointerList", %"struct.PointerList"* %agg.tmp, i64 0, i32 3 -- %agg.tmp6.sroa.3.0.copyload = load i8*, i8** %agg.tmp6.sroa.3.0..sroa_idx12, align 8 -- %11 = load %4*, %4** %10, align 8 -- invoke void @callee1(%"struct.SearchSpec::State"* nonnull align 8 dereferenceable(8) %state, %4* %11) #31 -- to label %invoke.cont4 unwind label %lpad.i -- --lpad.i: ; preds = %invoke.cont2 -- %12 = landingpad { i8*, i32 } -- cleanup -- call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2 -- %.phi.trans.insert = bitcast %"struct.SearchSpec::State"* %state to i8** -- %.pre = load i8*, i8** %.phi.trans.insert, align 8 -- br label %lpad1.body -- --invoke.cont4: ; preds = %invoke.cont2 -- call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2 -- %13 = load %4*, %4** %initialText.i.i, align 8 -- store %4* null, %4** %initialText.i.i, align 8 -- %call78 = call fastcc i8* @callee3(%4* %13) #31 [ "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] -- call void (...) @llvm.objc.clang.arc.noop.use(i8* %call78) #2 -- %14 = bitcast %"struct.SearchSpec::State"* %state to i8** -- %15 = load i8*, i8** %14, align 8 -- call void @llvm.objc.release(i8* %15) #2 -- call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2 -- call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1 -- %16 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call78) #2 -- ret i8* %call78 -- --lpad: ; preds = %entry -- %17 = landingpad { i8*, i32 } -- cleanup -- br label %ehcleanup -- --lpad1: ; preds = %invoke.cont -- %18 = landingpad { i8*, i32 } -- cleanup -- br label %lpad1.body -- --lpad1.body: ; preds = %lpad1, %lpad.i -- %19 = phi i8* [ %6, %lpad1 ], [ %.pre, %lpad.i ] -- %eh.lpad-body = phi { i8*, i32 } [ %18, %lpad1 ], [ %12, %lpad.i ] -- call void @llvm.objc.release(i8* %19) #2 -- br label %ehcleanup -- --ehcleanup: ; preds = %lpad1.body, %lpad -- %.pn = phi { i8*, i32 } [ %eh.lpad-body, %lpad1.body ], [ %17, %lpad ] -- call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2 -- call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1 -- resume { i8*, i32 } %.pn --} -- --; Function Attrs: minsize optsize ssp uwtable --define i8* @invoke_bar(i8* nocapture readonly %.block_descriptor, i8* %stateWrapper) #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { --entry: -- %state = alloca %"struct.DynamicCallback", align 8 -- %agg.tmp = alloca %"struct.PointerList", align 8 -- %0 = tail call i8* @llvm.objc.retain(i8* %stateWrapper) #2 -- %1 = bitcast %"struct.DynamicCallback"* %state to i8* -- call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #2 -- %2 = getelementptr inbounds i8, i8* %stateWrapper, i64 16 -- %3 = bitcast i8* %2 to %"struct.DynamicCallback"* (i8*)** -- %4 = load %"struct.DynamicCallback"* (i8*)*, %"struct.DynamicCallback"* (i8*)** %3, align 8 -- %call.i4 = invoke nonnull align 8 dereferenceable(8) %"struct.DynamicCallback"* %4(i8* nonnull %stateWrapper) #31 -- to label %invoke.cont unwind label %lpad -- --invoke.cont: ; preds = %entry -- %call.i.i = getelementptr inbounds %"struct.DynamicCallback", %"struct.DynamicCallback"* %state, i64 0, i32 0 -- %call2.i.i = 
getelementptr inbounds %"struct.DynamicCallback", %"struct.DynamicCallback"* %call.i4, i64 0, i32 0 -- %5 = load %10*, %10** %call2.i.i, align 8 -- %6 = bitcast %10* %5 to i8* -- %7 = tail call i8* @llvm.objc.retain(i8* %6) #2 -- store %10* %5, %10** %call.i.i, align 8 -- %block.capture.addr = getelementptr inbounds i8, i8* %.block_descriptor, i64 32 -- %8 = bitcast i8* %block.capture.addr to i8** -- %9 = load i8*, i8** %8, align 8 -- invoke void @callee2(%"struct.PointerList"* nonnull sret(%"struct.PointerList") align 8 %agg.tmp, i8* %9, i1 zeroext false) #31 -- to label %invoke.cont2 unwind label %lpad1 -- --invoke.cont2: ; preds = %invoke.cont -- %block.capture.addr3 = getelementptr inbounds i8, i8* %.block_descriptor, i64 40 -- %10 = bitcast i8* %block.capture.addr3 to %10** -- %agg.tmp6.sroa.3.0..sroa_idx12 = getelementptr inbounds %"struct.PointerList", %"struct.PointerList"* %agg.tmp, i64 0, i32 3 -- %agg.tmp6.sroa.3.0.copyload = load i8*, i8** %agg.tmp6.sroa.3.0..sroa_idx12, align 8 -- %11 = load %10*, %10** %10, align 8 -- invoke void @callee5(%"struct.DynamicCallback"* nonnull align 8 dereferenceable(8) %state, %10* %11) #31 -- to label %invoke.cont4 unwind label %lpad.i -- --lpad.i: ; preds = %invoke.cont2 -- %12 = landingpad { i8*, i32 } -- cleanup -- call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2 -- %.phi.trans.insert = bitcast %"struct.DynamicCallback"* %state to i8** -- %.pre = load i8*, i8** %.phi.trans.insert, align 8 -- br label %lpad1.body -- --invoke.cont4: ; preds = %invoke.cont2 -- call void @llvm.objc.release(i8* %agg.tmp6.sroa.3.0.copyload) #2 -- %13 = load %10*, %10** %call.i.i, align 8 -- store %10* null, %10** %call.i.i, align 8 -- %call78 = call fastcc i8* @callee4(%10* %13) #31 [ "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] -- call void (...) @llvm.objc.clang.arc.noop.use(i8* %call78) #2 -- %14 = bitcast %"struct.DynamicCallback"* %state to i8** -- %15 = load i8*, i8** %14, align 8 -- call void @llvm.objc.release(i8* %15) #2 -- call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2 -- call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1 -- %16 = tail call i8* @llvm.objc.autoreleaseReturnValue(i8* %call78) #2 -- ret i8* %call78 -- --lpad: ; preds = %entry -- %17 = landingpad { i8*, i32 } -- cleanup -- br label %ehcleanup -- --lpad1: ; preds = %invoke.cont -- %18 = landingpad { i8*, i32 } -- cleanup -- br label %lpad1.body -- --lpad1.body: ; preds = %lpad1, %lpad.i -- %19 = phi i8* [ %6, %lpad1 ], [ %.pre, %lpad.i ] -- %eh.lpad-body = phi { i8*, i32 } [ %18, %lpad1 ], [ %12, %lpad.i ] -- call void @llvm.objc.release(i8* %19) #2 -- br label %ehcleanup -- --ehcleanup: ; preds = %lpad1.body, %lpad -- %.pn = phi { i8*, i32 } [ %eh.lpad-body, %lpad1.body ], [ %17, %lpad ] -- call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %1) #2 -- call void @llvm.objc.release(i8* nonnull %stateWrapper) #2, !clang.imprecise_release !1 -- resume { i8*, i32 } %.pn --} --declare void @callee1(%"struct.SearchSpec::State"* nonnull align 8 dereferenceable(8), %4*) --declare void @callee2(%"struct.PointerList"* sret(%"struct.PointerList") align 8, i8*, i1 zeroext) --declare i8* @callee3(%4* %state.coerce) --declare i8* @callee4(%10* %state.coerce) --declare void @callee5(%"struct.DynamicCallback"* nonnull align 8 dereferenceable(8), %10*) --declare i32 @__gxx_personality_v0(...) 
--declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) --declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) --declare i8* @llvm.objc.autoreleaseReturnValue(i8*) --declare void @llvm.objc.clang.arc.noop.use(...) --declare void @llvm.objc.release(i8*) --declare i8* @llvm.objc.retain(i8*) --declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) -- --!1 = !{} diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl index 967a6224edd455..5015e65c2d7640 100644 --- a/third_party/llvm/workspace.bzl +++ b/third_party/llvm/workspace.bzl @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive") def repo(name): """Imports LLVM.""" - LLVM_COMMIT = "bcb685e11945946335c2dc6265779f0226491b49" - LLVM_SHA256 = "dbeb744a9656b7e7035b350ea6b2d303db26da8da000bc85a13f517c5a13195b" + LLVM_COMMIT = "67d7903262ce5c35bb23d599040dff29b9d7759e" + LLVM_SHA256 = "49062de6219c30871d4dd11c047ed1d70783c345adaacca2199c0830849b06a7" tf_http_archive( name = name, diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch old mode 100644 new mode 100755 index be1c1f0838e9d7..a476720fd2dbd6 --- a/third_party/stablehlo/temporary.patch +++ b/third_party/stablehlo/temporary.patch @@ -1,39 +1,14 @@ diff --ruN a/stablehlo/BUILD.bazel b/stablehlo/BUILD.bazel --- stablehlo/BUILD.bazel +++ stablehlo/BUILD.bazel -@@ -279,6 +279,24 @@ - ) - - cc_library( -+ name = "experimental_ops", -+ srcs = [ -+ "stablehlo/dialect/ExperimentalOps.cpp", -+ ], -+ hdrs = [ -+ "stablehlo/dialect/ExperimentalOps.h", -+ ], -+ strip_include_prefix = ".", -+ deps = [ -+ ":stablehlo_ops", -+ "@llvm-project//llvm:Support", -+ "@llvm-project//mlir:FuncDialect", -+ "@llvm-project//mlir:IR", -+ "@llvm-project//mlir:Support", -+ ], -+) -+ -+cc_library( - name = "interpreter_ops", - srcs = [ - "stablehlo/reference/InterpreterOps.cpp", -@@ -780,6 +798,7 @@ +@@ -890,6 +890,7 @@ + hdrs = [ + "stablehlo/transforms/MapStablehloToVhlo.h", + "stablehlo/transforms/Passes.h", ++ "stablehlo/transforms/StablehloRefineShapes.h", + ], + strip_include_prefix = ".", deps = [ - ":base", - ":chlo_ops", -+ ":experimental_ops", - ":stablehlo_ops", - ":stablehlo_ops_inc_gen", - ":stablehlo_pass_inc_gen", diff --ruN a/stablehlo/CMakeLists.txt b/stablehlo/CMakeLists.txt --- stablehlo/CMakeLists.txt +++ stablehlo/CMakeLists.txt @@ -181,32 +156,198 @@ diff --ruN a/stablehlo/CMakeLists.txt b/stablehlo/CMakeLists.txt #------------------------------------------------------------------------------- # Directory setup -diff --ruN a/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir b/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir ---- stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir -+++ stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir -@@ -19,6 +19,7 @@ - func.func @iota_dimension_0() -> tensor<4x8xf32> { - // CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() - // CHECK-SAME{LITERAL}: <{value = dense<[[0.000000e+00], [1.000000e+00], [2.000000e+00], [3.000000e+00]]> : tensor<4x1xf32>}> -+ // CHECK-DAG: %[[VAR1:.*]] = tosa.tile %[[VAR0]] {multiples = array} - %0 = "stablehlo.iota"() {iota_dimension = 0 : i64} : () -> (tensor<4x8xf32>) - return %0 : tensor<4x8xf32> - } -@@ -27,6 +28,7 @@ - func.func @iota_dimension_1() -> tensor<4x8xi32> { - // CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() - // CHECK-SAME{LITERAL}: <{value = dense<[[0, 1, 2, 3, 4, 5, 6, 7]]> : tensor<1x8xi32>}> -+ // CHECK-DAG: %[[VAR1:.*]] = tosa.tile %[[VAR0]] {multiples = array} - %0 = "stablehlo.iota"() {iota_dimension = 1 : 
i64} : () -> (tensor<4x8xi32>) - return %0 : tensor<4x8xi32> - } -diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/Base.cpp ---- stablehlo/stablehlo/dialect/Base.cpp -+++ stablehlo/stablehlo/dialect/Base.cpp -@@ -600,5 +600,18 @@ - return UnrankedTensorType::get(components.getElementType()); - } +diff --ruN a/stablehlo/stablehlo/CMakeLists.txt b/stablehlo/stablehlo/CMakeLists.txt +--- stablehlo/stablehlo/CMakeLists.txt ++++ stablehlo/stablehlo/CMakeLists.txt +@@ -15,6 +15,7 @@ + add_subdirectory(api) + add_subdirectory(conversions) + add_subdirectory(dialect) ++add_subdirectory(experimental) + add_subdirectory(integrations) + add_subdirectory(reference) + add_subdirectory(tests) +diff --ruN a/stablehlo/stablehlo/api/PortableApi.h b/stablehlo/stablehlo/api/PortableApi.h +--- stablehlo/stablehlo/api/PortableApi.h ++++ stablehlo/stablehlo/api/PortableApi.h +@@ -27,7 +27,8 @@ + /// Return the current version for portable API. + /// Increments on all meaningful changes to this file. +-inline int64_t getApiVersion() { return 4; } ++/// Or on large breaking source changes that are difficult to integrate. ++inline int64_t getApiVersion() { return 5; } + + // Get the current StableHLO version. + // +diff --ruN a/stablehlo/stablehlo/experimental/BUILD.bazel b/stablehlo/stablehlo/experimental/BUILD.bazel +--- stablehlo/stablehlo/experimental/BUILD.bazel ++++ stablehlo/stablehlo/experimental/BUILD.bazel +@@ -0,0 +1,114 @@ ++# Copyright 2023 The StableHLO Authors. All Rights Reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") ++ ++package( ++ default_visibility = ["//visibility:public"], ++ licenses = ["notice"], ++) ++ ++cc_library( ++ name = "experimental_base", ++ srcs = [ ++ "dialect/Base.cpp", ++ ], ++ hdrs = [ ++ "dialect/Base.h", ++ ], ++ deps = [ ++ "@llvm-project//llvm:Support", ++ "@llvm-project//mlir:IR", ++ ], ++) ++ ++cc_library( ++ name = "experimental_stablehlo_ops", ++ srcs = [ ++ "dialect/StablehloOps.cpp", ++ ], ++ hdrs = [ ++ "dialect/StablehloOps.h", ++ ], ++ deps = [ ++ ":experimental_base", ++ "//:stablehlo_ops", ++ "@llvm-project//llvm:Support", ++ "@llvm-project//mlir:FuncDialect", ++ "@llvm-project//mlir:IR", ++ "@llvm-project//mlir:Support", ++ ], ++) ++ ++gentbl_cc_library( ++ name = "experimental_stablehlo_pass_inc_gen", ++ tbl_outs = [ ++ ( ++ [ ++ "-gen-pass-decls", ++ ], ++ "transforms/Passes.h.inc", ++ ), ++ ], ++ tblgen = "@llvm-project//mlir:mlir-tblgen", ++ td_file = "transforms/Passes.td", ++ deps = ["@llvm-project//mlir:PassBaseTdFiles"], ++) ++ ++cc_library( ++ name = "experimental_stablehlo_passes", ++ srcs = [ ++ "transforms/StablehloCanonicalizeDynamism.cpp", ++ "transforms/StablehloRefineShapes.cpp", ++ ], ++ hdrs = [ ++ "transforms/Passes.h", ++ ], ++ deps = [ ++ ":experimental_stablehlo_ops", ++ ":experimental_stablehlo_pass_inc_gen", ++ "//:base", ++ "//:chlo_ops", ++ "//:stablehlo_ops", ++ "//:stablehlo_ops_inc_gen", ++ "//:stablehlo_passes", ++ "//:stablehlo_type_inference", ++ "@llvm-project//llvm:Support", ++ "@llvm-project//mlir:FuncDialect", ++ "@llvm-project//mlir:IR", ++ "@llvm-project//mlir:InferTypeOpInterface", ++ "@llvm-project//mlir:Pass", ++ "@llvm-project//mlir:Support", ++ "@llvm-project//mlir:TransformUtils", ++ "@llvm-project//mlir:Transforms", ++ ], ++) ++ ++cc_binary( ++ name = "experimental-stablehlo-opt", ++ srcs = [ ++ "tools/StablehloOptMain.cpp", ++ ], ++ deps = [ ++ ":experimental_stablehlo_passes", ++ "//:interpreter_ops", ++ "//:register", ++ "//:stablehlo_passes", ++ "//:test_utils", ++ "//:tosa_passes", ++ "@llvm-project//mlir:AllExtensions", ++ "@llvm-project//mlir:AllPassesAndDialects", ++ "@llvm-project//mlir:MlirOptLib", ++ "@llvm-project//mlir:TosaDialect", ++ ], ++) +diff --ruN a/stablehlo/stablehlo/experimental/CMakeLists.txt b/stablehlo/stablehlo/experimental/CMakeLists.txt +--- stablehlo/stablehlo/experimental/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/CMakeLists.txt +@@ -0,0 +1,18 @@ ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++add_subdirectory(dialect) ++add_subdirectory(tests) ++add_subdirectory(tools) ++add_subdirectory(transforms) +diff --ruN a/stablehlo/stablehlo/experimental/dialect/Base.cpp b/stablehlo/stablehlo/experimental/dialect/Base.cpp +--- stablehlo/stablehlo/experimental/dialect/Base.cpp ++++ stablehlo/stablehlo/experimental/dialect/Base.cpp +@@ -0,0 +1,39 @@ ++/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++ Copyright 2022 The StableHLO Authors. 
++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include "stablehlo/experimental/dialect/Base.h" ++ ++#include "mlir/IR/BuiltinAttributes.h" ++#include "mlir/IR/BuiltinTypes.h" ++ ++namespace mlir { ++namespace hlo { ++ +DenseIntElementsAttr getPaddingAttr(MLIRContext* context, + ArrayRef values) { + return DenseIntElementsAttr::get( @@ -220,50 +361,97 @@ diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/ + return getPaddingAttr(builder->getContext(), values); +} + - } // namespace hlo - } // namespace mlir -diff --ruN a/stablehlo/stablehlo/dialect/Base.h b/stablehlo/stablehlo/dialect/Base.h ---- stablehlo/stablehlo/dialect/Base.h -+++ stablehlo/stablehlo/dialect/Base.h -@@ -194,6 +194,10 @@ - - ShapedType createShapedType(ShapedTypeComponents components); - ++} // namespace hlo ++} // namespace mlir +diff --ruN a/stablehlo/stablehlo/experimental/dialect/Base.h b/stablehlo/stablehlo/experimental/dialect/Base.h +--- stablehlo/stablehlo/experimental/dialect/Base.h ++++ stablehlo/stablehlo/experimental/dialect/Base.h +@@ -0,0 +1,35 @@ ++/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. ++ Copyright 2022 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef STABLEHLO_EXPERIMENTAL_DIALECT_BASE_H ++#define STABLEHLO_EXPERIMENTAL_DIALECT_BASE_H ++ ++#include "llvm/ADT/ArrayRef.h" ++#include "mlir/IR/Builders.h" ++#include "mlir/IR/BuiltinAttributes.h" ++#include "mlir/IR/MLIRContext.h" ++ ++namespace mlir { ++namespace hlo { ++ +DenseIntElementsAttr getPaddingAttr(MLIRContext *context, + ArrayRef value); +DenseIntElementsAttr getPaddingAttr(Builder *builder, ArrayRef value); + - // This interface is implemented by both StableHLO and MHLO dialects - // and is used as the foundation for sharing verification, type inference and - // prettyprinting logic between them. 
-diff --ruN a/stablehlo/stablehlo/dialect/CMakeLists.txt b/stablehlo/stablehlo/dialect/CMakeLists.txt ---- stablehlo/stablehlo/dialect/CMakeLists.txt -+++ stablehlo/stablehlo/dialect/CMakeLists.txt -@@ -77,6 +77,20 @@ - target_include_directories(ChloOps INTERFACE - $ - $ ++} // namespace hlo ++} // namespace mlir ++ ++#endif // STABLEHLO_EXPERIMENTAL_DIALECT_BASE_H +diff --ruN a/stablehlo/stablehlo/experimental/dialect/CMakeLists.txt b/stablehlo/stablehlo/experimental/dialect/CMakeLists.txt +--- stablehlo/stablehlo/experimental/dialect/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/dialect/CMakeLists.txt +@@ -0,0 +1,42 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++add_mlir_library(ExperimentalStablehloBase ++ PARTIAL_SOURCES_INTENDED ++ Base.cpp ++ ++ LINK_LIBS PUBLIC ++ MLIRIR +) + -+add_mlir_dialect_library(ExperimentalOps ++add_mlir_dialect_library(ExperimentalStablehloOps + PARTIAL_SOURCES_INTENDED -+ ExperimentalOps.cpp ++ StablehloOps.cpp + + DEPENDS + StablehloOpsIncGen + + LINK_LIBS PUBLIC ++ ExperimentalStablehloBase + MLIRFuncDialect + MLIRIR + MLIRSupport + StablehloOps - ) - - add_mlir_dialect_library(StablehloRegister -diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stablehlo/dialect/ExperimentalOps.cpp ---- stablehlo/stablehlo/dialect/ExperimentalOps.cpp -+++ stablehlo/stablehlo/dialect/ExperimentalOps.cpp -@@ -0,0 +1,504 @@ ++) ++ ++target_include_directories(ExperimentalStablehloOps INTERFACE ++ $ ++ $ ++) +diff --ruN a/stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp b/stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp +--- stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp ++++ stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp +@@ -0,0 +1,615 @@ +/* Copyright 2023 The StableHLO Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); @@ -279,8 +467,9 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh +limitations under the License. +==============================================================================*/ + -+#include "stablehlo/dialect/ExperimentalOps.h" ++#include "stablehlo/experimental/dialect/StablehloOps.h" + ++#include +#include + +#include "llvm/ADT/ArrayRef.h" @@ -293,6 +482,7 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + +namespace mlir { +namespace stablehlo { ++namespace experimental { + +LogicalResult DynamicReduceWindowOpAdaptor::verify() { + // Before checking the constraints inherited from ReduceWindowOp, @@ -306,8 +496,7 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + // api_version and backend_config have default values. + // call_target_name should be "stablehlo.dynamic_reduce_window". + // called_computations carries the body. 
-+ if (attr.getName() != "api_version" && -+ attr.getName() != "backend_config" && ++ if (attr.getName() != "api_version" && attr.getName() != "backend_config" && + attr.getName() != "call_target_name" && + attr.getName() != "called_computations") + return op_.emitError() @@ -688,8 +877,8 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + + // dynamic_top_k_i2 + auto kType = k.getType().dyn_cast(); -+ if (!kType || !kType.hasRank() || -+ kType.getRank() != 0 || !kType.getElementType().isIntOrIndex()) ++ if (!kType || !kType.hasRank() || kType.getRank() != 0 || ++ !kType.getElementType().isIntOrIndex()) + return op_.emitError() + << "expects k (operand #1) " + << "to be a 0-dimensional tensor of integer or index type"; @@ -751,7 +940,6 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + return op_.getInputs()[1].cast>(); +} + -+ +TypedValue DynamicTopKOpAdaptor::getValues() { + return op_.getResults()[0].cast>(); +} @@ -760,18 +948,129 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + return op_.getResults()[1].cast>(); +} + -+std::optional getDynamicTopKOp( -+ CustomCallOp op) { ++std::optional getDynamicTopKOp(CustomCallOp op) { + if (op.getCallTargetName() != "stablehlo.dynamic_top_k") return {}; + return DynamicTopKOpAdaptor(op); +} + ++LogicalResult TopKOpAdaptor::verify() { ++ if (op_->getNumOperands() != 1) ++ return op_.emitError("expects size(operands) = 1"); ++ if (op_->getNumResults() != 2) ++ return op_.emitError("expects size(results) = 2"); ++ if (!op_.getBackendConfig().empty()) ++ return op_.emitError() << "expects an empty backend_config"; ++ if (op_.getCallTargetName() != "mhlo.topk") ++ return op_.emitError() << "expects @mhlo.topk"; ++ ++ auto operand = op_.getInputs()[0]; ++ auto values = op_.getResults()[0]; ++ auto indices = op_.getResults()[1]; ++ DictionaryAttr topkAttributes = ++ op_->getAttrOfType("mhlo.attributes"); ++ if (!topkAttributes) { ++ return op_.emitError() ++ << "mhlo.attributes missing or not a dictionary attribute"; ++ } ++ ++ IntegerAttr k_attr = topkAttributes.get("k").dyn_cast_or_null(); ++ if (!k_attr) { ++ return op_.emitError() << "mhlo.attributes.k not present or not an integer"; ++ } ++ int64_t k = k_attr.getInt(); ++ ++ // mhlo.topk_c5 ++ if (k < 0) return op_.emitError() << "expects k >= 0"; ++ ++ // mhlo.topk_i1 ++ auto operandType = operand.getType().dyn_cast(); ++ if (!operandType || !operandType.hasRank() || operandType.getRank() < 1 || ++ !operandType.getElementType().isIntOrFloat()) ++ return op_.emitError() ++ << "expects operand #0 " ++ << "to be a tensor of integer or floating-point type " ++ << "of rank at least 1"; ++ ++ // mhlo.topk_o1 ++ auto valuesType = values.getType().dyn_cast(); ++ if (!valuesType || !valuesType.hasRank() || valuesType.getRank() < 1 || ++ !valuesType.getElementType().isIntOrFloat()) ++ return op_.emitError() ++ << "expects values (result #0) " ++ << "to be a tensor of integer or floating-point type " ++ << "of rank at least 1"; ++ ++ // mhlo.topk_o2 ++ auto indicesType = indices.getType().dyn_cast(); ++ if (!indicesType || !indicesType.hasRank() || indicesType.getRank() < 1 || ++ !indicesType.getElementType().isSignlessInteger(32)) ++ return op_.emitError() << "expects indices (result #1) " ++ << "to be a tensor of si32 of rank at least 1"; ++ ++ // mhlo.topk_c1 && mhlo.topk_c2 ++ auto operandLastDim = operandType.getRank() - 1; ++ SmallVector expectedValuesShape(operandType.getShape()); ++ 
expectedValuesShape[operandLastDim] = k; ++ if (failed(verifyCompatibleShape(expectedValuesShape, valuesType.getShape()))) ++ return op_.emitError() << "expects the values shape to match the operand " ++ "shape in all but the last dimension, and " ++ "that the last dimension of the values shape " ++ "has a size k"; ++ ++ // mhlo.topk_c3 ++ if (valuesType.getElementType() != operandType.getElementType()) ++ return op_.emitError() ++ << "expects the values element type to be the same as the operand " ++ << "element type"; ++ ++ // mhlo.topk_c4 ++ if (failed( ++ verifyCompatibleShape(indicesType.getShape(), valuesType.getShape()))) ++ return op_.emitError() ++ << "expects the indices shape to match the values shape"; ++ ++ return success(); ++} ++ ++TypedValue TopKOpAdaptor::getOperand() { ++ return op_.getInputs()[0].cast>(); ++} ++ ++TypedValue TopKOpAdaptor::getValues() { ++ return op_.getResults()[0].cast>(); ++} ++ ++TypedValue TopKOpAdaptor::getIndices() { ++ return op_.getResults()[1].cast>(); ++} ++ ++int64_t TopKOpAdaptor::getK() { ++ DictionaryAttr topkAttributes = ++ op_->getAttrOfType("mhlo.attributes"); ++ return topkAttributes.get("k").cast().getInt(); ++} ++ ++bool TopKOpAdaptor::getLargest() { ++ DictionaryAttr topkAttributes = ++ op_->getAttrOfType("mhlo.attributes"); ++ IntegerAttr largest = ++ topkAttributes.get("largest").dyn_cast_or_null(); ++ ++ return (!largest) ? true : largest.getInt(); ++} ++ ++std::optional getTopKOp(CustomCallOp op) { ++ if (op.getCallTargetName() != "mhlo.topk") return {}; ++ return TopKOpAdaptor(op); ++} ++ ++} // namespace experimental +} // namespace stablehlo +} // namespace mlir -diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo/dialect/ExperimentalOps.h ---- stablehlo/stablehlo/dialect/ExperimentalOps.h -+++ stablehlo/stablehlo/dialect/ExperimentalOps.h -@@ -0,0 +1,227 @@ +diff --ruN a/stablehlo/stablehlo/experimental/dialect/StablehloOps.h b/stablehlo/stablehlo/experimental/dialect/StablehloOps.h +--- stablehlo/stablehlo/experimental/dialect/StablehloOps.h ++++ stablehlo/stablehlo/experimental/dialect/StablehloOps.h +@@ -0,0 +1,299 @@ +/* Copyright 2023 The StableHLO Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); @@ -787,8 +1086,8 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo +limitations under the License. +==============================================================================*/ + -+#ifndef STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H -+#define STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H ++#ifndef STABLEHLO_EXPERIMENTAL_DIALECT_STABLEHLO_OPS_H ++#define STABLEHLO_EXPERIMENTAL_DIALECT_STABLEHLO_OPS_H + +// This file supports XLA-specific experiments with the StableHLO opset. +// These experiments are not yet ready to be upstreamed to openxla/stablehlo @@ -805,9 +1104,11 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LogicalResult.h" +#include "stablehlo/dialect/StablehloOps.h" ++#include "stablehlo/experimental/dialect/Base.h" + +namespace mlir { +namespace stablehlo { ++namespace experimental { + +// The DynamicReduceWindowOp experiment provides a dynamic version of +// ReduceWindowOp. Once the dynamism RFC is figured out, we expect to have an @@ -995,55 +1296,253 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo +// "stablehlo.dynamic_top_k". 
+std::optional getDynamicTopKOp(CustomCallOp op); + ++/////////////////// ++// MHLO Op Wrappers ++// There are some ops in MHLO which have experimental support in StableHLO ++// programs by representing them as custom_calls with the target `mhlo.op_name`. ++// The level of support of these ops is similar to the other custom_calls in ++// this file. Generally these ops will be added to StableHLO and their ++// experimental support can be deprecated in favor of op's type inference. ++/////////////////// ++ ++// The TopK experiment provides a StableHLO adapter to MHLO TopKOp. ++// In the future we expect stablehlo.top_k to be added which will use the same ++// refinement rules. ++// ++// Within this experiment, TopKOp is represented via the serialized MHLO ++// `stablehlo.custom_call @mhlo.topk` custom call. ++// ++// The semantics of experimental TopKOp are inherited from the semantics of ++// mhlo.topk. ++// ++// #### Inputs ++// ++// | Label | Name | Type | ++// |-------|-----------------|----------------------------------------------| ++// | (I1) | `operand` | tensor of integer or floating-point type | ++// | (I2) | `k` | constant of type si64 | ++// | (I3) | `largest` | constant of type i1 | ++// ++// #### Outputs ++// ++// | Name | Type | ++// |----------------|------------------------------------------| ++// | `values` | tensor of integer or floating-point type | ++// | `indices` | tensor of si32 type | ++// ++// #### Constraints ++// ++// * (C1) `shape(values)[:-1] = shape(operand)[:-1]` ++// * (C2) `shape(values)[-1] = k` ++// * (C3) `element_type(values) = element_type(operand)` ++// * (C4) `shape(indices) = shape(values)` ++// * (C5) `k >= 0` ++// ++class TopKOpAdaptor { ++ public: ++ TopKOpAdaptor(CustomCallOp op) : op_(op) {} ++ operator Operation*() { return op_; } ++ Operation* operator->() { return op_; } ++ ++ // These accessors assume that the operation is well-formed (i.e. that it ++ // can pass verification). ++ TypedValue getOperand(); ++ TypedValue getValues(); ++ TypedValue getIndices(); ++ int64_t getK(); ++ bool getLargest(); ++ ++ // Verifies the constraints documented above. ++ // Emits errors if errors are detected. ++ LogicalResult verify(); ++ ++ private: ++ CustomCallOp op_; ++}; ++ ++// Wraps a custom call in a TopKOpAdaptor. ++// Fails if the call_target_name of the custom call doesn't match ++// "mhlo.topk". 
++std::optional getTopKOp(CustomCallOp op); ++ ++} // namespace experimental +} // namespace stablehlo +} // namespace mlir + -+#endif // STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H -diff --ruN a/stablehlo/stablehlo/dialect/StablehloOps.cpp b/stablehlo/stablehlo/dialect/StablehloOps.cpp ---- stablehlo/stablehlo/dialect/StablehloOps.cpp -+++ stablehlo/stablehlo/dialect/StablehloOps.cpp -@@ -1543,6 +1543,7 @@ - p << " across dimensions = ["; - llvm::interleaveComma(getDimensions().getValues(), p); - p << "]"; -+ p.printOptionalAttrDict(getOperation()->getAttrs(), {"dimensions"}); - p << " : "; - p.printFunctionalType(*this); - } else { -@@ -1705,6 +1706,7 @@ - if (parser.parseKeyword("across") || parser.parseKeyword("dimensions") || - parser.parseEqual() || - parser.parseCommaSeparatedList(AsmParser::Delimiter::Square, parseDim) || -+ parser.parseOptionalAttrDict(result.attributes) || - parser.parseColon() || parser.parseType(reduceOpFnType) || - parser.parseOptionalLocationSpecifier(explicitLoc)) - return failure(); -diff --ruN a/stablehlo/stablehlo/tests/print_reduce.mlir b/stablehlo/stablehlo/tests/print_reduce.mlir ---- stablehlo/stablehlo/tests/print_reduce.mlir -+++ stablehlo/stablehlo/tests/print_reduce.mlir -@@ -168,3 +168,15 @@ - - func.return %0: tensor<4xf32> - } ++#endif // STABLEHLO_EXPERIMENTAL_DIALECT_STABLEHLO_OPS_H +diff --ruN a/stablehlo/stablehlo/experimental/tests/BUILD.bazel b/stablehlo/stablehlo/experimental/tests/BUILD.bazel +--- stablehlo/stablehlo/experimental/tests/BUILD.bazel ++++ stablehlo/stablehlo/experimental/tests/BUILD.bazel +@@ -0,0 +1,59 @@ ++# Copyright 2023 The StableHLO Authors. All Rights Reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++load("@bazel_skylib//rules:expand_template.bzl", "expand_template") ++load("@llvm-project//llvm:lit_test.bzl", "lit_test", "package_path") ++ ++package( ++ default_visibility = ["//visibility:public"], ++ licenses = ["notice"], ++) + -+// The test case makes sure any custom attrs set on the reduce-op are -+// printed/parsed when pretty-printed. ++# Equivalent of configure_lit_site_cfg from CMakeLists.txt. ++expand_template( ++ name = "lit_site_cfg_py_gen", ++ testonly = True, ++ out = "lit.site.cfg.py", ++ substitutions = { ++ "@LIT_SITE_CFG_IN_HEADER@": "# Autogenerated, do not edit.", ++ "@LLVM_TOOLS_DIR@": package_path("@llvm-project//llvm:BUILD"), ++ "\"@STABLEHLO_TOOLS_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], 'stablehlo')", ++ "\"@STABLEHLO_SOURCE_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], 'stablehlo')", ++ }, ++ template = "lit.site.cfg.py.in", ++) + -+// CHECK-LABEL: func @pretty_print_with_custom_attr -+// CHECK: applies stablehlo.add across dimensions = [1] {custom_user_attr = 1 : i64} ++# Equivalent of add_lit_testsuite from CMakeLists.txt. 
++[ ++ lit_test( ++ name = "%s.test" % src, ++ size = "small", ++ srcs = [src], ++ data = [ ++ "lit.cfg.py", ++ "lit.site.cfg.py", ++ "//:stablehlo-opt", ++ "//:stablehlo-translate", ++ "//stablehlo/experimental:experimental-stablehlo-opt", ++ "@llvm-project//llvm:FileCheck", ++ "@llvm-project//llvm:not", ++ ] + glob(["%s.bc" % src]), ++ tags = ["stablehlo_tests"], ++ ) ++ for src in glob(["**/*.mlir"]) ++] ++ ++test_suite( ++ name = "experimental_stablehlo_tests", ++ tags = ["experimental_stablehlo_tests"], ++) +diff --ruN a/stablehlo/stablehlo/experimental/tests/CMakeLists.txt b/stablehlo/stablehlo/experimental/tests/CMakeLists.txt +--- stablehlo/stablehlo/experimental/tests/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/tests/CMakeLists.txt +@@ -0,0 +1,29 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++configure_lit_site_cfg( ++ ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ++ ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py ++ MAIN_CONFIG ++ ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py ++) ++add_lit_testsuite(check-experimental-stablehlo-tests "Running the experimental/tests/ suite" ++ ${CMAKE_CURRENT_BINARY_DIR} ++ DEPENDS ++ FileCheck ++ experimental-stablehlo-opt ++ stablehlo-translate ++) ++add_dependencies(check-stablehlo-quick check-experimental-stablehlo-tests) +diff --ruN a/stablehlo/stablehlo/experimental/tests/lit.cfg.py b/stablehlo/stablehlo/experimental/tests/lit.cfg.py +--- stablehlo/stablehlo/experimental/tests/lit.cfg.py ++++ stablehlo/stablehlo/experimental/tests/lit.cfg.py +@@ -0,0 +1,42 @@ ++"""Lit configuration to drive test in this repo.""" ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++# -*- Python -*- ++# pylint: disable=undefined-variable ++ ++import os ++ ++import lit.formats ++from lit.llvm import llvm_config ++ ++# Populate Lit configuration with the minimal required metadata. ++# Some metadata is populated in lit.site.cfg.py.in. 
++config.name = 'STABLEHLO_TESTS_SUITE' ++config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) ++config.suffixes = ['.mlir'] ++config.test_source_root = os.path.dirname(__file__) ++ ++# Make LLVM and StableHLO tools available in RUN directives ++tools = [ ++ 'FileCheck', ++ 'experimental-stablehlo-opt', ++ 'stablehlo-translate', ++ 'not', ++] ++tool_dirs = [ ++ config.llvm_tools_dir, ++ config.stablehlo_tools_dir, ++] ++llvm_config.add_tool_substitutions(tools, tool_dirs) +diff --ruN a/stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in b/stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in +--- stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in ++++ stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in +@@ -0,0 +1,21 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++@LIT_SITE_CFG_IN_HEADER@ ++ ++import lit.llvm ++lit.llvm.initialize(lit_config, config) ++config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" ++config.stablehlo_tools_dir = "@STABLEHLO_TOOLS_DIR@" ++lit_config.load_config(config, "@STABLEHLO_SOURCE_DIR@" + "/stablehlo/experimental/tests/lit.cfg.py") +diff --ruN a/stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir b/stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir +--- stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir ++++ stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir +@@ -0,0 +1,344 @@ ++// RUN: experimental-stablehlo-opt --experimental-stablehlo-canonicalize-dynamism --split-input-file --verify-diagnostics %s | FileCheck %s + -+func.func @pretty_print_with_custom_attr(%arg0: tensor<2x64x13xf32>) -> tensor<2x13xf32> { -+ %0 = stablehlo.constant dense<0.000000e+00> : tensor -+ %1 = stablehlo.reduce(%arg0 init: %0) applies stablehlo.add across dimensions = [1] {custom_user_attr = 1 : i64} : (tensor<2x64x13xf32>, tensor) -> tensor<2x13xf32> -+ return %1 : tensor<2x13xf32> -+} -diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir ---- stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir -+++ stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir -@@ -426,6 +426,172 @@ - - // ----- - +// CHECK-LABEL: func @dynamic_reduce_window_success_static_result_type +func.func @dynamic_reduce_window_success_static_result_type(%arg0: tensor<3x2xf32>, %arg1: tensor) -> tensor<2x2xf32> { + // CHECK-NOT: stablehlo.dynamic_reduce_window @@ -1209,17 +1708,6 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st +} + +// ----- -+ - // CHECK-LABEL: func @dynamic_reshape_success - func.func @dynamic_reshape_success(%arg0: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NOT: stablehlo.dynamic_reshape -@@ -452,6 +618,185 @@ - %0 = stablehlo.constant dense<[1, 4]> : tensor<2xi64> - %1 = stablehlo.dynamic_reshape %arg0, %0 : 
(tensor<4xf32>, tensor<2xi64>) -> tensor<1x?xf32> - return %1 : tensor<1x?xf32> -+} -+ -+// ----- + +// CHECK-LABEL: func @dynamic_rng_bit_generator_success +func.func @dynamic_rng_bit_generator_success(%arg0: tensor<2xui64>) -> tensor<1x4xf32> { @@ -1396,16 +1884,13 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st + %k = stablehlo.constant dense<3> : tensor + %1:2 = stablehlo.custom_call @stablehlo.dynamic_top_k(%arg0, %k) : (tensor<16xf32>, tensor) -> (tensor<3xf32>, tensor<4xi32>) + return %1#0, %1#1 : tensor<3xf32>, tensor<4xi32> - } - - // ----- -diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir ---- stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir -+++ stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir -@@ -607,12 +607,55 @@ - - // ----- - ++} +diff --ruN a/stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir +--- stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir ++++ stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir +@@ -0,0 +1,152 @@ ++// RUN: experimental-stablehlo-opt --experimental-stablehlo-refine-shapes --split-input-file --verify-diagnostics %s | FileCheck %s ++ +// CHECK-LABEL: @main +func.func @main(%arg0: tensor<3x2xf32>, %arg1: tensor) -> tensor<*xf32> { + // CHECK: stablehlo.dynamic_reduce_window{{.*}} -> tensor<2x2xf32> @@ -1426,16 +1911,6 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/ +} + +// ----- -+ - // CHECK-LABEL: @refine_dynamic_reshape - func.func @refine_dynamic_reshape(%arg0: tensor<4xf32>) -> tensor<*xf32> { - // CHECK: stablehlo.dynamic_reshape{{.*}} -> tensor<1x4xf32> - %0 = stablehlo.constant dense<[1, 4]> : tensor<2xi64> - %1 = stablehlo.dynamic_reshape %arg0, %0 : (tensor<4xf32>, tensor<2xi64>) -> tensor<*xf32> - func.return %1 : tensor<*xf32> -+} -+ -+// ----- + +// CHECK-LABEL: @refine_dynamic_rng_bit_generator +func.func @refine_dynamic_rng_bit_generator(%arg0: tensor<2xui64>) -> (tensor, tensor<*xf32>) { @@ -1455,36 +1930,374 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/ + %k = stablehlo.constant dense<4> : tensor + %1:2 = stablehlo.custom_call @stablehlo.dynamic_top_k(%arg0, %k) : (tensor<16xf32>, tensor) -> (tensor, tensor) + return %1#0, %1#1 : tensor, tensor - } - - // ----- -diff --ruN a/stablehlo/stablehlo/transforms/Passes.td b/stablehlo/stablehlo/transforms/Passes.td ---- stablehlo/stablehlo/transforms/Passes.td -+++ stablehlo/stablehlo/transforms/Passes.td -@@ -25,6 +25,7 @@ - For example, if the output_shape operand of DynamicReshapeOp is a constant - value, then the operation can be transformed to ReshapeOp. 
- }]; ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_topk ++func.func @refine_mhlo_topk(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // CHECK: mhlo.topk{{.*}} -> (tensor<5x4xf32>, tensor<5x4xi32>) ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_too_many_operands ++func.func @refine_mhlo_error_too_many_operands(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects size(operands) = 1}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0, %arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>, tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_too_few_results ++func.func @refine_mhlo_error_too_few_results(%arg0: tensor<5x16xf32>) -> (tensor) { ++ // expected-error@+1{{expects size(results) = 2}} ++ %0 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor) ++ return %0 : tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_wrong_output_1_type ++func.func @refine_mhlo_error_wrong_output_1_type(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects values (result #0) to be a tensor of integer or floating-point type of rank at least 1}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_wrong_output_2_type ++func.func @refine_mhlo_error_wrong_output_2_type(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects indices (result #1) to be a tensor of si32 of rank at least 1}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c1_wrong_output_shape ++func.func @refine_mhlo_error_c1_wrong_output_shape(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the values shape to match the operand}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c2_last_dim_not_k ++func.func @refine_mhlo_error_c2_last_dim_not_k(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the values shape to match the operand}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c3_wrong_output_type ++func.func @refine_mhlo_error_c3_wrong_output_type(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the values element type to be the same as the operand element type}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// 
----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c4_outputs_shape_mismatch ++func.func @refine_mhlo_error_c4_outputs_shape_mismatch(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the indices shape to match the values shape}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c5_negative_k ++func.func @refine_mhlo_error_c5_negative_k(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects k >= 0}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = -4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} +diff --ruN a/stablehlo/stablehlo/experimental/tools/CMakeLists.txt b/stablehlo/stablehlo/experimental/tools/CMakeLists.txt +--- stablehlo/stablehlo/experimental/tools/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/tools/CMakeLists.txt +@@ -0,0 +1,41 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++set(LLVM_OPTIONAL_SOURCES ++ StablehloOptMain.cpp ++) ++ ++# stablehlo-opt ++get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) ++get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) ++get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) ++set(LIBS ++ ${dialect_libs} ++ ${conversion_libs} ++ ${extension_libs} ++ ExperimentalStablehloPasses ++ MLIROptLib ++ StablehloRegister ++ StablehloTestUtils ++ StablehloPasses ++ InterpreterOps ++ StablehloTOSATransforms ++ ) ++add_llvm_executable(experimental-stablehlo-opt StablehloOptMain.cpp) ++llvm_update_compile_flags(experimental-stablehlo-opt) ++target_link_libraries(experimental-stablehlo-opt PRIVATE ${LIBS}) ++ ++mlir_check_all_link_libraries(experimental-stablehlo-opt) ++ +diff --ruN a/stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp b/stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp +--- stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp ++++ stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp +@@ -0,0 +1,46 @@ ++/* Copyright 2023 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. 
++==============================================================================*/ ++ ++#include "mlir/Dialect/Tosa/IR/TosaOps.h" ++#include "mlir/Dialect/Tosa/Transforms/Passes.h" ++#include "mlir/InitAllDialects.h" ++#include "mlir/InitAllExtensions.h" ++#include "mlir/InitAllPasses.h" ++#include "mlir/Tools/mlir-opt/MlirOptMain.h" ++#include "stablehlo/conversions/tosa/transforms/Passes.h" ++#include "stablehlo/dialect/Register.h" ++#include "stablehlo/experimental/transforms/Passes.h" ++#include "stablehlo/reference/InterpreterOps.h" ++#include "stablehlo/tests/TestUtils.h" ++#include "stablehlo/transforms/Passes.h" ++ ++int main(int argc, char **argv) { ++ mlir::registerAllPasses(); ++ mlir::hlo::registerAllTestPasses(); ++ mlir::stablehlo::registerPassPipelines(); ++ mlir::stablehlo::registerPasses(); ++ mlir::stablehlo::experimental::registerPasses(); ++ mlir::tosa::registerStablehloLegalizeToTosaPassPass(); ++ mlir::tosa::registerStablehloPrepareForTosaPassPass(); ++ ++ mlir::DialectRegistry registry; ++ mlir::registerAllDialects(registry); ++ mlir::registerAllExtensions(registry); ++ mlir::stablehlo::registerAllDialects(registry); ++ registry.insert(); ++ ++ return failed( ++ mlir::MlirOptMain(argc, argv, "Experimental StableHLO optimizer driver\n", registry)); ++} +diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt +--- stablehlo/stablehlo/experimental/transforms/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/transforms/CMakeLists.txt +@@ -0,0 +1,39 @@ ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++set(LLVM_TARGET_DEFINITIONS Passes.td) ++mlir_tablegen(Passes.h.inc -gen-pass-decls) ++add_public_tablegen_target(ExperimentalPassesIncGen) ++ ++add_mlir_dialect_library(ExperimentalStablehloPasses ++ PARTIAL_SOURCES_INTENDED ++ StablehloCanonicalizeDynamism.cpp ++ StablehloRefineShapes.cpp ++ ++ DEPENDS ++ ExperimentalPassesIncGen ++ ++ LINK_LIBS PUBLIC ++ ChloOps ++ MLIRFuncDialect ++ MLIRIR ++ MLIRInferTypeOpInterface ++ MLIRSupport ++ MLIRTransformUtils ++ ExperimentalStablehloOps ++ StablehloBase ++ StablehloOps ++ StablehloPasses ++ StablehloTypeInference ++) +diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/stablehlo/experimental/transforms/Passes.h +--- stablehlo/stablehlo/experimental/transforms/Passes.h ++++ stablehlo/stablehlo/experimental/transforms/Passes.h +@@ -0,0 +1,37 @@ ++/* Copyright 2023 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef STABLEHLO_EXPERIMENTAL_TRANSFORMS_PASSES_H ++#define STABLEHLO_EXPERIMENTAL_TRANSFORMS_PASSES_H ++ ++#include ++ ++#include "mlir/Pass/Pass.h" ++#include "mlir/Transforms/DialectConversion.h" ++ ++namespace mlir { ++namespace stablehlo { ++namespace experimental { ++ ++#define GEN_PASS_DECL_STABLEHLOCANONICALIZEDYNAMISMPASS ++#define GEN_PASS_DECL_STABLEHLOREFINESHAPESPASS ++#define GEN_PASS_REGISTRATION ++#include "stablehlo/experimental/transforms/Passes.h.inc" ++ ++} // namespace experimental ++} // namespace stablehlo ++} // namespace mlir ++ ++#endif // STABLEHLO_EXPERIMENTAL_TRANSFORMS_PASSES_H +diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.td b/stablehlo/stablehlo/experimental/transforms/Passes.td +--- stablehlo/stablehlo/experimental/transforms/Passes.td ++++ stablehlo/stablehlo/experimental/transforms/Passes.td +@@ -0,0 +1,31 @@ ++/* Copyright 2023 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++include "mlir/Pass/PassBase.td" ++ ++def StablehloCanonicalizeDynamismPass : Pass<"experimental-stablehlo-canonicalize-dynamism", "func::FuncOp"> { ++ let summary = "(Experimental) Canonicalizes dynamic StableHLO ops into static ops."; ++ let description = [{ ++ Experimental version of the --stablehlo-canonicalize-dynamism pass. ++ }]; + let dependentDialects = ["mlir::chlo::ChloDialect"]; - } - - def StablehloLegalizeToVhloPass : Pass<"stablehlo-legalize-to-vhlo", "ModuleOp"> { -diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp ---- stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp -+++ stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp -@@ -24,6 +24,8 @@ - #include "mlir/Interfaces/InferTypeOpInterface.h" - #include "mlir/Support/LogicalResult.h" - #include "mlir/Transforms/GreedyPatternRewriteDriver.h" ++} ++ ++def StablehloRefineShapesPass : Pass<"experimental-stablehlo-refine-shapes", "ModuleOp"> { ++ let summary = "(Experimental) Refines shapes across a StableHLO program."; ++ let description = [{ ++ Experimental version of the --stablehlo-refine-shapes pass. ++ }]; ++} +diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp +--- stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp ++++ stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp +@@ -0,0 +1,167 @@ ++/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. ++ Copyright 2023 The StableHLO Authors. 
++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include ++ ++#include "llvm/ADT/STLExtras.h" ++#include "llvm/ADT/SmallVector.h" ++#include "mlir/Dialect/Func/IR/FuncOps.h" ++#include "mlir/IR/PatternMatch.h" ++#include "mlir/Interfaces/InferTypeOpInterface.h" ++#include "mlir/Support/LogicalResult.h" ++#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "stablehlo/dialect/ChloOps.h" -+#include "stablehlo/dialect/ExperimentalOps.h" - #include "stablehlo/dialect/StablehloOps.h" - #include "stablehlo/transforms/Passes.h" - -@@ -198,6 +200,54 @@ - } - }; - ++#include "stablehlo/dialect/StablehloOps.h" ++#include "stablehlo/experimental/dialect/StablehloOps.h" ++#include "stablehlo/experimental/transforms/Passes.h" ++#include "stablehlo/transforms/Passes.h" ++ ++namespace mlir { ++namespace stablehlo { ++namespace experimental { ++ ++#define GEN_PASS_DEF_STABLEHLOCANONICALIZEDYNAMISMPASS ++#include "stablehlo/experimental/transforms/Passes.h.inc" ++ ++namespace { ++ +struct CanonicalizeDynamicReduceWindowOpPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; @@ -1532,17 +2345,6 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/ + return success(); + } +}; -+ - struct CanonicalizeDynamicReshapeOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; -@@ -210,6 +260,56 @@ - if (!op.getType().hasStaticShape()) - return rewriter.notifyMatchFailure(op, "expected static result type"); - rewriter.replaceOpWithNewOp(op, op.getType(), op.getOperand()); -+ return success(); -+ } -+}; + +struct CanonicalizeDynamicRngBitGeneratorOpPattern + : public OpRewritePattern { @@ -1590,35 +2392,84 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/ + + rewriter.replaceOpWithNewOp( + op, op->getResultTypes(), op.getOperand(), k[0]); - return success(); - } - }; -@@ -320,7 +420,10 @@ - patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); ++ return success(); ++ } ++}; ++ ++struct StablehloCanonicalizeDynamismPass ++ : public impl::StablehloCanonicalizeDynamismPassBase< ++ StablehloCanonicalizeDynamismPass> { ++ using StablehloCanonicalizeDynamismPassBase:: ++ StablehloCanonicalizeDynamismPassBase; ++ ++ void runOnOperation() override { ++ GreedyRewriteConfig config; ++ config.useTopDownTraversal = true; ++ config.enableRegionSimplification = true; ++ config.maxIterations = 2; ++ config.maxNumRewrites = GreedyRewriteConfig::kNoLimit; ++ config.strictMode = GreedyRewriteStrictness::AnyOp; ++ ++ RewritePatternSet patterns(&getContext()); ++ populateStablehloCanonicalizeDynamismPatterns(&patterns, &getContext()); + patterns.add(&getContext()); - patterns.add(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); - patterns.add( - &getContext()); - patterns.add(&getContext()); -diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp 
b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp ---- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp -+++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp -@@ -43,6 +43,7 @@ - #include "mlir/Transforms/GreedyPatternRewriteDriver.h" - #include "stablehlo/dialect/Base.h" - #include "stablehlo/dialect/ChloOps.h" -+#include "stablehlo/dialect/ExperimentalOps.h" - #include "stablehlo/dialect/StablehloOps.h" - #include "stablehlo/dialect/TypeInference.h" - #include "stablehlo/transforms/Passes.h" -@@ -844,12 +845,97 @@ - } - }; - ++ if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), ++ config))) { ++ return signalPassFailure(); ++ } ++ } ++}; ++ ++} // namespace ++} // namespace experimental ++} // namespace stablehlo ++} // namespace mlir +diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp +--- stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp ++++ stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp +@@ -0,0 +1,178 @@ ++/* Copyright 2022 The StableHLO Authors. ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include "stablehlo/transforms/StablehloRefineShapes.h" ++ ++#include ++ ++#include "llvm/ADT/SmallVector.h" ++#include "mlir/Dialect/Func/IR/FuncOps.h" ++#include "mlir/IR/PatternMatch.h" ++#include "mlir/Interfaces/InferTypeOpInterface.h" ++#include "mlir/Support/LogicalResult.h" ++#include "mlir/Transforms/GreedyPatternRewriteDriver.h" ++#include "stablehlo/dialect/Base.h" ++#include "stablehlo/dialect/StablehloOps.h" ++#include "stablehlo/dialect/TypeInference.h" ++#include "stablehlo/experimental/dialect/StablehloOps.h" ++#include "stablehlo/experimental/transforms/Passes.h" ++#include "stablehlo/transforms/Passes.h" ++ ++namespace mlir { ++namespace stablehlo { ++namespace experimental { ++ ++#define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS ++#include "stablehlo/experimental/transforms/Passes.h.inc" ++ ++namespace { ++ +struct RefineDynamicReduceWindowOpPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; @@ -1660,15 +2511,6 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl + return refineReturnTypes(rewriter, op, inferredReturnTypes); + } +}; -+ - struct RefineDynamicReshapeOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(DynamicReshapeOp op, - PatternRewriter& rewriter) const override { - return refineReturnShape(rewriter, op, op.getOutputShape()); -+ } -+}; + +struct RefineDynamicRngBitGeneratorOpPattern + : public OpRewritePattern { @@ -1710,18 +2552,908 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl + + outputShape[operandType.getRank() - 1] = k[0]; + return refineReturnTypes(rewriter, op, {{outputShape}, {outputShape}}); - } - }; - -@@ -1181,7 +1267,10 
@@ - patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); ++ } ++}; ++ ++struct RefineTopKOpPattern : public OpRewritePattern { ++ using OpRewritePattern::OpRewritePattern; ++ LogicalResult matchAndRewrite(CustomCallOp impl, ++ PatternRewriter& rewriter) const override { ++ auto maybeOp = getTopKOp(impl); ++ if (!maybeOp || failed(maybeOp->verify())) return failure(); ++ TopKOpAdaptor op = *maybeOp; ++ ++ auto operandType = op.getOperand().getType().cast(); ++ SmallVector outputShape(operandType.getShape()); ++ outputShape.back() = op.getK(); ++ return refineReturnTypes(rewriter, op, {{outputShape}, {outputShape}}); ++ } ++}; ++ ++struct StablehloRefineShapesPass ++ : public impl::StablehloRefineShapesPassBase { ++ using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase; ++ ++ void runOnOperation() override { ++ auto func = getStablehloRefineShapesTarget(getOperation()); ++ if (!func) return signalPassFailure(); ++ ++ // The algorithm behind this pass consists of a single traversal of the ++ // function. This is sufficient because we only support one function per ++ // program at the moment. ++ // TODO(#1048): Find out why .maxIterations = 1 no longer works. ++ // There have been recent refactors to applyPatternsAndFoldGreedily ++ // upstream, and that might be the reason. ++ GreedyRewriteConfig config; ++ config.useTopDownTraversal = true; ++ config.enableRegionSimplification = true; ++ config.maxIterations = 2; ++ config.maxNumRewrites = GreedyRewriteConfig::kNoLimit; ++ config.strictMode = GreedyRewriteStrictness::AnyOp; ++ ++ RewritePatternSet patterns(&getContext()); ++ populateStablehloRefineShapesPatterns(&patterns, &getContext()); + patterns.add(&getContext()); - patterns.add(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); ++ patterns.add(&getContext()); ++ if (failed( ++ applyPatternsAndFoldGreedily(func, std::move(patterns), config))) { ++ return signalPassFailure(); ++ } ++ } ++}; ++ ++} // namespace ++} // namespace experimental ++} // namespace stablehlo ++} // namespace mlir +diff --ruN a/stablehlo/stablehlo/tests/infer_chlo.mlir b/stablehlo/stablehlo/tests/infer_chlo.mlir +--- stablehlo/stablehlo/tests/infer_chlo.mlir ++++ stablehlo/stablehlo/tests/infer_chlo.mlir +@@ -120,10 +120,10 @@ + // ----- + // CHECK-LABEL: @broadcast_select_reify + func.func @broadcast_select_reify(%arg0: tensor<2xi1>, %arg1: tensor, %arg2: tensor) -> tensor<1xindex> { +- // CHECK: %0 = shape.const_shape [2] : tensor<1xindex> ++ // CHECK: %0 = shape.shape_of %arg0 : tensor<2xi1> -> tensor<1xindex> + // CHECK-NEXT: %1 = shape.shape_of %arg1 : tensor -> tensor<1xindex> + // CHECK-NEXT: %2 = shape.shape_of %arg2 : tensor -> tensor<1xindex> +- // CHECK-NEXT: %3 = shape.broadcast %1, %2, %0 : tensor<1xindex>, tensor<1xindex>, tensor<1xindex> -> tensor<1xindex> ++ // CHECK-NEXT: %3 = shape.broadcast %0, %1, %2 : tensor<1xindex>, tensor<1xindex>, tensor<1xindex> -> tensor<1xindex> + %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor, tensor) -> tensor + %1 = "hlo_test_infer.reify_return_type_shapes"(%0) : (tensor) -> tensor<1xindex> + return %1: tensor<1xindex> +diff --ruN a/stablehlo/stablehlo/transforms/Passes.h b/stablehlo/stablehlo/transforms/Passes.h +--- stablehlo/stablehlo/transforms/Passes.h ++++ stablehlo/stablehlo/transforms/Passes.h +@@ -18,9 +18,12 @@ + + #include + ++#include 
"mlir/Dialect/Func/IR/FuncOps.h" + #include "mlir/Dialect/Quant/QuantOps.h" + #include "mlir/Dialect/Shape/IR/Shape.h" ++#include "mlir/IR/BuiltinOps.h" + #include "mlir/Pass/Pass.h" ++#include "mlir/Support/LogicalResult.h" + #include "mlir/Transforms/DialectConversion.h" + + namespace mlir { +@@ -34,6 +37,14 @@ + #define GEN_PASS_DECL_VHLOTOVERSIONPASS + #define GEN_PASS_REGISTRATION + #include "stablehlo/transforms/Passes.h.inc" ++ ++// Populates --stablehlo-canonicalize-dynamism patterns. ++void populateStablehloCanonicalizeDynamismPatterns(RewritePatternSet *patterns, ++ MLIRContext *context); ++ ++// Populates --stablehlo-refine-shapes patterns. ++void populateStablehloRefineShapesPatterns(RewritePatternSet *patterns, ++ MLIRContext *context); + + // Populates StableHLO ops to VHLO ops rewriting patterns. + void populateStablehloToVhloPatterns(RewritePatternSet *patterns, +diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp +--- stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp ++++ stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp +@@ -307,16 +307,7 @@ + config.strictMode = GreedyRewriteStrictness::AnyOp; + + RewritePatternSet patterns(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add( +- &getContext()); +- patterns.add(&getContext()); ++ populateStablehloCanonicalizeDynamismPatterns(&patterns, &getContext()); + if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + config))) { + return signalPassFailure(); +@@ -325,5 +316,19 @@ + }; + + } // namespace ++ ++void populateStablehloCanonicalizeDynamismPatterns(RewritePatternSet* patterns, ++ MLIRContext* context) { ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++} ++ + } // namespace stablehlo + } // namespace mlir +diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp +--- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp ++++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp +@@ -11,6 +11,8 @@ + See the License for the specific language governing permissions and + limitations under the License. + ==============================================================================*/ ++ ++#include "stablehlo/transforms/StablehloRefineShapes.h" + + #include + #include +@@ -53,6 +55,193 @@ + #define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS + #include "stablehlo/transforms/Passes.h.inc" + ++LogicalResult refineValues(PatternRewriter& rewriter, Operation* op, ++ ValueRange values, TypeRange types) { ++ if (values.size() != types.size()) ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "refineValues failed for " << types << ": expected " ++ << values.size() << " types, got " << types.size(); ++ }); ++ ++ // Check whether `types` contain any new information with respect to existing ++ // return types. Even if just a single dimension size out of an entire tensor ++ // type got updated, using `inferMostSpecificType` ensures that we don't ++ // miss that. 
++ bool needsRefinement = false; ++ SmallVector refinedTypes; ++ for (auto it : llvm::zip(values.getTypes(), types)) { ++ // Cannot use structured bindings to simplify this because capturing ++ // structured bindings in a lambda is a C++ 20 extension. ++ auto currentType = std::get<0>(it); ++ auto refinement = std::get<1>(it); ++ auto refinedType = hlo::inferMostSpecificType( ++ /*location=*/{}, {currentType, refinement}); ++ if (failed(refinedType)) ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "inferMostSpecificType failed for " << currentType << " and " ++ << refinement; ++ }); ++ refinedTypes.push_back(*refinedType); ++ needsRefinement |= (currentType != *refinedType); ++ } ++ if (!needsRefinement) ++ return rewriter.notifyMatchFailure(op, "doesn't need refinement"); ++ ++ for (auto it : llvm::zip(values, refinedTypes)) { ++ // Cannot use structured bindings to simplify this because capturing ++ // structured bindings in a lambda is a C++ 20 extension. ++ auto value = std::get<0>(it); ++ auto refinedType = std::get<1>(it); ++ if (value.getType() == refinedType) continue; ++ ++ // Check whether the users of this value are ready for the type of the ++ // value to be refined. ++ for (Operation* user : value.getUsers()) { ++ // CHLO and StableHLO ops are designed to support type refinements of ++ // their operands and results. Any operand type in these ops can change ++ // within what's supported by `inferMostSpecificType` without breaking ++ // verification of the op. ++ if (isa(user->getDialect())) ++ continue; ++ ++ // Simply changing operand type of `func.return` won't work because ++ // that won't update the FunctionType of the enclosing `func.func`. ++ // Nonetheless, we still want to support these ops because they are widely ++ // used in StableHLO programs (although the plan of record is to replace ++ // `func.return` ops in StableHLO programs with `stablehlo.return`: ++ // https://github.com/openxla/stablehlo/issues/425). ++ if (isa(user)) continue; ++ ++ // Unlike in TensorFlow's type inference pass, here we work only with ++ // allowlisted ops to focus our support on well-defined semantics of ++ // StableHLO programs. ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "unsupported refinement: tried to refine " << value.getType() ++ << " to " << refinedType << " for user " << user; ++ }); ++ } ++ ++ // Happy path: simply call setType here because most of our users are ++ // fine with that. ++ auto unrefinedType = value.getType(); ++ value.setType(refinedType); ++ ++ // Special case: for `func.return`, guard the refinement with a cast ++ // and leave propagation of the refined return type to a dedicated pattern. ++ auto isFuncReturn = [](OpOperand& use) -> bool { ++ return isa(use.getOwner()); ++ }; ++ if (llvm::none_of(value.getUses(), isFuncReturn)) continue; ++ rewriter.setInsertionPointAfter(op); ++ auto castToUnrefinedType = rewriter.create( ++ op->getLoc(), unrefinedType, value); ++ value.replaceUsesWithIf(castToUnrefinedType.getOutputs()[0], isFuncReturn); ++ } ++ ++ return success(); ++} ++ ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef types) { ++ if (failed(refineValues(rewriter, op, op->getResults(), types))) ++ return failure(); ++ ++ // This `replaceOpWithIf` call doesn't actually change the IR, but ++ // it does ask the rewriter to visit all the users of this op. 
There is no ++ // upstream API to achieve this directly, but if it's introduced in the ++ // future, we could use it here. ++ rewriter.replaceOpWithIf(op, op->getResults(), ++ [](OpOperand& use) { return false; }); ++ return success(); ++} ++ ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef refinements) { ++ SmallVector flattenedTypes; ++ hlo::flattenTupleTypes(op->getResultTypes(), flattenedTypes); ++ auto flattenedSize = flattenedTypes.size(); ++ if (flattenedSize != refinements.size()) ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "refineReturnTypes failed: expected " << flattenedSize ++ << " refinements, got " << refinements.size(); ++ }); ++ ++ SmallVector flattenedRefinedTypes; ++ for (auto it : llvm::zip(flattenedTypes, refinements)) { ++ // Cannot use structured bindings to simplify this because capturing ++ // structured bindings in a lambda is a C++ 20 extension. ++ ShapedType currentType = std::get<0>(it).dyn_cast(); ++ ShapedTypeComponents refinement = std::get<1>(it); ++ auto failWithReason = [&](StringRef reason) { ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "refineTypes failed: refining " << currentType ++ << "with refinement: {"; ++ if (refinement.hasRank()) { ++ diag << "shape = [" << refinement.getDims() << "]"; ++ if (refinement.getAttribute()) ++ diag << "attribute = " << refinement.getAttribute(); ++ } else { ++ diag << "hasRank = false"; ++ } ++ diag << ", elementType = " << refinement.getElementType(); ++ diag << "} failed: " << reason; ++ }); ++ }; ++ ++ // If the current type is not a shaped type, then the refinement must ++ // be completely empty. ++ if (!currentType) { ++ if (refinement.hasRank() || refinement.getElementType() || ++ refinement.getAttribute()) ++ return failWithReason("unsupported refinement"); ++ flattenedRefinedTypes.push_back(currentType); ++ continue; ++ } ++ ++ // If the refinement has an element type, then it must be the same as ++ // the current element type. ++ Type currentElementType = currentType.getElementType(); ++ if (refinement.getElementType() && ++ currentElementType != refinement.getElementType()) ++ return failWithReason("expected compatible element types"); ++ ++ // If neither the current type nor the refinement are ranked, then there's ++ // nothing to refine, and we return the current type. ++ bool hasRank = currentType.hasRank() || refinement.hasRank(); ++ if (!hasRank) { ++ flattenedRefinedTypes.push_back(currentType); ++ continue; ++ } ++ ++ // If either the current type or the refinement have encodings, then ++ // we fail. Encodings are left for future work. ++ Attribute currentEncoding = nullptr; ++ if (auto currentRankedType = currentType.dyn_cast()) { ++ currentEncoding = currentRankedType.getEncoding(); ++ } ++ Attribute refinedEncoding = refinement.getAttribute(); ++ if (currentEncoding || refinedEncoding) ++ return failWithReason("expected compatible encodings"); ++ ++ // If both the current type and the refinement have shapes, use the shape ++ // from the refinement. Otherwise, pick whatever is available. ++ // Make sure that the resulting type is compatible with the current type ++ // to avoid creating invalid code. ++ auto refinedShape = ++ refinement.hasRank() ? 
refinement.getDims() : currentType.getShape(); ++ auto refinedType = RankedTensorType::get(refinedShape, currentElementType); ++ if (!hlo::isCompatibleForHloTypeInference(currentType, refinedType)) ++ return failWithReason("expected compatible shapes"); ++ flattenedRefinedTypes.push_back(refinedType); ++ } ++ ++ SmallVector refinedTypes; ++ if (failed(hlo::unflattenTupleTypes(op->getResultTypes(), ++ flattenedRefinedTypes, refinedTypes))) ++ return failure(); ++ return refineReturnTypes(rewriter, op, refinedTypes); ++} ++ + namespace { + + // DenseElementsAttr can be constructed from ArrayRef but not from +@@ -422,245 +611,6 @@ + // StableHLO-specific extension to refine return types based on potentially + // refined operands. + +-// Refines the values using the given types. +-// Tricky implementation details: +-// 1) Need to support partial shape refinements, e.g. if just a single +-// dimension size out of an entire tensor type got refined. This is done +-// via inferMostSpecificType. +-// 2) Need to signal propagation of the refined shapes across the +-// StableHLO program. Different callers of this function have different +-// propagation needs, so this function doesn't signal anything on its own +-// and leaves that to the callers. +-LogicalResult refineValues(PatternRewriter& rewriter, Operation* op, +- ValueRange values, TypeRange types) { +- if (values.size() != types.size()) +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "refineValues failed for " << types << ": expected " +- << values.size() << " types, got " << types.size(); +- }); +- +- // Check whether `types` contain any new information with respect to existing +- // return types. Even if just a single dimension size out of an entire tensor +- // type got updated, using `inferMostSpecificType` ensures that we don't +- // miss that. +- bool needsRefinement = false; +- SmallVector refinedTypes; +- for (auto it : llvm::zip(values.getTypes(), types)) { +- // Cannot use structured bindings to simplify this because capturing +- // structured bindings in a lambda is a C++ 20 extension. +- auto currentType = std::get<0>(it); +- auto refinement = std::get<1>(it); +- auto refinedType = hlo::inferMostSpecificType( +- /*location=*/{}, {currentType, refinement}); +- if (failed(refinedType)) +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "inferMostSpecificType failed for " << currentType << " and " +- << refinement; +- }); +- refinedTypes.push_back(*refinedType); +- needsRefinement |= (currentType != *refinedType); +- } +- if (!needsRefinement) +- return rewriter.notifyMatchFailure(op, "doesn't need refinement"); +- +- for (auto it : llvm::zip(values, refinedTypes)) { +- // Cannot use structured bindings to simplify this because capturing +- // structured bindings in a lambda is a C++ 20 extension. +- auto value = std::get<0>(it); +- auto refinedType = std::get<1>(it); +- if (value.getType() == refinedType) continue; +- +- // Check whether the users of this value are ready for the type of the +- // value to be refined. +- for (Operation* user : value.getUsers()) { +- // CHLO and StableHLO ops are designed to support type refinements of +- // their operands and results. Any operand type in these ops can change +- // within what's supported by `inferMostSpecificType` without breaking +- // verification of the op. 
+- if (isa(user->getDialect())) +- continue; +- +- // Simply changing operand type of `func.return` won't work because +- // that won't update the FunctionType of the enclosing `func.func`. +- // Nonetheless, we still want to support these ops because they are widely +- // used in StableHLO programs (although the plan of record is to replace +- // `func.return` ops in StableHLO programs with `stablehlo.return`: +- // https://github.com/openxla/stablehlo/issues/425). +- if (isa(user)) continue; +- +- // Unlike in TensorFlow's type inference pass, here we work only with +- // allowlisted ops to focus our support on well-defined semantics of +- // StableHLO programs. +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "unsupported refinement: tried to refine " << value.getType() +- << " to " << refinedType << " for user " << user; +- }); +- } +- +- // Happy path: simply call setType here because most of our users are +- // fine with that. +- auto unrefinedType = value.getType(); +- value.setType(refinedType); +- +- // Special case: for `func.return`, guard the refinement with a cast +- // and leave propagation of the refined return type to a dedicated pattern. +- auto isFuncReturn = [](OpOperand& use) -> bool { +- return isa(use.getOwner()); +- }; +- if (llvm::none_of(value.getUses(), isFuncReturn)) continue; +- rewriter.setInsertionPointAfter(op); +- auto castToUnrefinedType = rewriter.create( +- op->getLoc(), unrefinedType, value); +- value.replaceUsesWithIf(castToUnrefinedType.getOutputs()[0], isFuncReturn); +- } +- +- return success(); +-} +- +-// Refines the return types of the given operation using the given types. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. +-LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, +- ArrayRef types) { +- if (failed(refineValues(rewriter, op, op->getResults(), types))) +- return failure(); +- +- // This `replaceOpWithIf` call doesn't actually change the IR, but +- // it does ask the rewriter to visit all the users of this op. There is no +- // upstream API to achieve this directly, but if it's introduced in the +- // future, we could use it here. +- rewriter.replaceOpWithIf(op, op->getResults(), +- [](OpOperand& use) { return false; }); +- return success(); +-} +- +-// Refines the return types of the given operation using the given types. +-// Tricky implementation details: +-// 1) `types` can include non-shaped types. If there are tuple types, +-// then they are first flattened into non-tuple types using in-order +-// traversal, and only then we apply the refinements. If there are other +-// types, then the corresponding refinements must be completely empty. +-// 2) Encodings are not supported. In principle, TypeExtensions should be +-// supportable, but this needs careful thinking through. Given that no one +-// asked for support for bounded dynamism in this pass yet, this is left +-// for future work. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. 
+-LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, +- ArrayRef refinements) { +- SmallVector flattenedTypes; +- hlo::flattenTupleTypes(op->getResultTypes(), flattenedTypes); +- auto flattenedSize = flattenedTypes.size(); +- if (flattenedSize != refinements.size()) +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "refineReturnTypes failed: expected " << flattenedSize +- << " refinements, got " << refinements.size(); +- }); +- +- SmallVector flattenedRefinedTypes; +- for (auto it : llvm::zip(flattenedTypes, refinements)) { +- // Cannot use structured bindings to simplify this because capturing +- // structured bindings in a lambda is a C++ 20 extension. +- ShapedType currentType = std::get<0>(it).dyn_cast(); +- ShapedTypeComponents refinement = std::get<1>(it); +- auto failWithReason = [&](StringRef reason) { +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "refineTypes failed: refining " << currentType +- << "with refinement: {"; +- if (refinement.hasRank()) { +- diag << "shape = [" << refinement.getDims() << "]"; +- if (refinement.getAttribute()) +- diag << "attribute = " << refinement.getAttribute(); +- } else { +- diag << "hasRank = false"; +- } +- diag << ", elementType = " << refinement.getElementType(); +- diag << "} failed: " << reason; +- }); +- }; +- +- // If the current type is not a shaped type, then the refinement must +- // be completely empty. +- if (!currentType) { +- if (refinement.hasRank() || refinement.getElementType() || +- refinement.getAttribute()) +- return failWithReason("unsupported refinement"); +- flattenedRefinedTypes.push_back(currentType); +- continue; +- } +- +- // If the refinement has an element type, then it must be the same as +- // the current element type. +- Type currentElementType = currentType.getElementType(); +- if (refinement.getElementType() && +- currentElementType != refinement.getElementType()) +- return failWithReason("expected compatible element types"); +- +- // If neither the current type nor the refinement are ranked, then there's +- // nothing to refine, and we return the current type. +- bool hasRank = currentType.hasRank() || refinement.hasRank(); +- if (!hasRank) { +- flattenedRefinedTypes.push_back(currentType); +- continue; +- } +- +- // If either the current type or the refinement have encodings, then +- // we fail. Encodings are left for future work. +- Attribute currentEncoding = nullptr; +- if (auto currentRankedType = currentType.dyn_cast()) { +- currentEncoding = currentRankedType.getEncoding(); +- } +- Attribute refinedEncoding = refinement.getAttribute(); +- if (currentEncoding || refinedEncoding) +- return failWithReason("expected compatible encodings"); +- +- // If both the current type and the refinement have shapes, use the shape +- // from the refinement. Otherwise, pick whatever is available. +- // Make sure that the resulting type is compatible with the current type +- // to avoid creating invalid code. +- auto refinedShape = +- refinement.hasRank() ? 
refinement.getDims() : currentType.getShape(); +- auto refinedType = RankedTensorType::get(refinedShape, currentElementType); +- if (!hlo::isCompatibleForHloTypeInference(currentType, refinedType)) +- return failWithReason("expected compatible shapes"); +- flattenedRefinedTypes.push_back(refinedType); +- } +- +- SmallVector refinedTypes; +- if (failed(hlo::unflattenTupleTypes(op->getResultTypes(), +- flattenedRefinedTypes, refinedTypes))) +- return failure(); +- return refineReturnTypes(rewriter, op, refinedTypes); +-} +- +-// Refines the return type of the given operation using the given shape. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. +-template +-LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, +- ArrayRef shape) { +- return refineReturnTypes(rewriter, op, ShapedTypeComponents(shape)); +-} +- +-// Refines the return type of the given operation using the given shape. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. +-template +-LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, +- Value shapeValue) { +- // At the moment, we only support refining return types using fully static +- // shape values which serves the current use cases well. +- // Support for partially static shape values is left for future work. +- SmallVector shape; +- if (failed(hlo::matchInts(shapeValue, shape))) +- return rewriter.notifyMatchFailure(op, "expected constant output shape"); +- return refineReturnShape(rewriter, op, shape); +-} +- + struct RefineAllGatherOpPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(AllGatherOp op, +@@ -1115,39 +1065,8 @@ + using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase; + + void runOnOperation() override { +- // Only one function per module is supported at the moment to avoid the need +- // to think about iterative type inference algorithms. +- // Current use cases are served well by inlining multiple functions into +- // a single function, so we leave native support for multiple functions to +- // future work. +- // To enable modules that contain CustomCallOp::called_computations, +- // we allow multiple functions, in which case we only refine the main +- // function called "main", assuming that the called computations will have +- // static shapes. Lifting this assumption and expanding refinement to +- // multiple functions is left for future work. +- ModuleOp module = getOperation(); +- auto funcs = llvm::to_vector(module.getOps()); +- if (funcs.empty()) return; +- func::FuncOp func; +- if (funcs.size() == 1) { +- func = funcs[0]; +- } else { +- func = module.lookupSymbol("main"); +- } +- if (!func) { +- module.emitOpError() +- << "must have no more than one function or a `main`" +- << " function to clearly identify which function will be refined"; +- return signalPassFailure(); +- } +- +- // Similarly, only one block per function is supported at the moment. +- // At the StableHLO level, functions are expected to only have one block, +- // so supporting more is out of scope for this pass. 
+- if (!func.getRegion().hasOneBlock()) { +- func.emitOpError() << "must have exactly one block"; +- return signalPassFailure(); +- } ++ auto func = getStablehloRefineShapesTarget(getOperation()); ++ if (!func) return signalPassFailure(); + + // The algorithm behind this pass consists of a single traversal of the + // function. This is sufficient because we only support one function per +@@ -1163,44 +1082,7 @@ + config.strictMode = GreedyRewriteStrictness::AnyOp; + + RewritePatternSet patterns(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); ++ populateStablehloRefineShapesPatterns(&patterns, &getContext()); + if (failed( + applyPatternsAndFoldGreedily(func, std::move(patterns), config))) { + return signalPassFailure(); +@@ -1209,5 +1091,86 @@ + }; + + } // namespace ++ ++func::FuncOp getStablehloRefineShapesTarget(ModuleOp module) { ++ // Only one function per module is supported at the moment to avoid the need ++ // to think about iterative type inference algorithms. ++ // Current use cases are served well by inlining multiple functions into ++ // a single function, so we leave native support for multiple functions to ++ // future work. ++ // To enable modules that contain CustomCallOp::called_computations, ++ // we allow multiple functions, in which case we only refine the main ++ // function called "main", assuming that the called computations will have ++ // static shapes. Lifting this assumption and expanding refinement to ++ // multiple functions is left for future work. ++ auto funcs = llvm::to_vector(module.getOps()); ++ if (funcs.empty()) return nullptr; ++ ++ func::FuncOp result; ++ if (funcs.size() == 1) { ++ result = funcs[0]; ++ } else { ++ result = module.lookupSymbol("main"); ++ } ++ if (!result) { ++ module.emitOpError() ++ << "must have no more than one function or a `main`" ++ << " function to clearly identify which function will be refined"; ++ return nullptr; ++ } ++ ++ // Similarly, only one block per function is supported at the moment. ++ // At the StableHLO level, functions are expected to only have one block, ++ // so supporting more is out of scope for this pass. 
++ if (!result.getRegion().hasOneBlock()) { ++ result.emitOpError() << "must have exactly one block"; ++ return nullptr; ++ } ++ ++ return result; ++} ++ ++void populateStablehloRefineShapesPatterns(RewritePatternSet* patterns, ++ MLIRContext* context) { ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++} ++ + } // namespace stablehlo + } // namespace mlir +diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.h b/stablehlo/stablehlo/transforms/StablehloRefineShapes.h +--- stablehlo/stablehlo/transforms/StablehloRefineShapes.h ++++ stablehlo/stablehlo/transforms/StablehloRefineShapes.h +@@ -0,0 +1,102 @@ ++/* Copyright 2022 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef STABLEHLO_TRANSFORMS_STABLEHLO_REFINE_SHAPES_H ++#define STABLEHLO_TRANSFORMS_STABLEHLO_REFINE_SHAPES_H ++ ++#include "llvm/ADT/SmallVector.h" ++#include "mlir/Dialect/Func/IR/FuncOps.h" ++#include "mlir/IR/BuiltinOps.h" ++#include "mlir/IR/Operation.h" ++#include "mlir/IR/PatternMatch.h" ++#include "mlir/IR/Types.h" ++#include "mlir/IR/Value.h" ++#include "mlir/Interfaces/InferTypeOpInterface.h" ++#include "mlir/Support/LogicalResult.h" ++#include "stablehlo/dialect/Base.h" ++ ++namespace mlir { ++namespace stablehlo { ++ ++// Gets a FuncOp that --stablehlo-refine-shapes will run on. ++// Returns a nullptr and emits appropriate errors if such a function cannot ++// be obtained from the module. ++func::FuncOp getStablehloRefineShapesTarget(ModuleOp module); ++ ++// Refines the values using the given types. ++// Tricky implementation details: ++// 1) Need to support partial shape refinements, e.g. if just a single ++// dimension size out of an entire tensor type got refined. This is done ++// via inferMostSpecificType. ++// 2) Need to signal propagation of the refined shapes across the ++// StableHLO program. 
Different callers of this function have different ++// propagation needs, so this function doesn't signal anything on its own ++// and leaves that to the callers. ++LogicalResult refineValues(PatternRewriter& rewriter, Operation* op, ++ ValueRange values, TypeRange types); ++ ++// Refines the return types of the given operation using the given types. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef types); ++ ++// Refines the return types of the given operation using the given types. ++// Tricky implementation details: ++// 1) `types` can include non-shaped types. If there are tuple types, ++// then they are first flattened into non-tuple types using in-order ++// traversal, and only then we apply the refinements. If there are other ++// types, then the corresponding refinements must be completely empty. ++// 2) Encodings are not supported. In principle, TypeExtensions should be ++// supportable, but this needs careful thinking through. Given that no one ++// asked for support for bounded dynamism in this pass yet, this is left ++// for future work. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef refinements); ++ ++// Refines the return type of the given operation using the given shape. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++template ++LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, ++ ArrayRef shape) { ++ return refineReturnTypes(rewriter, op, ShapedTypeComponents(shape)); ++} ++ ++// Refines the return type of the given operation using the given shape. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++template ++LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, ++ Value shapeValue) { ++ // At the moment, we only support refining return types using fully static ++ // shape values which serves the current use cases well. ++ // Support for partially static shape values is left for future work. 
++ SmallVector<int64_t> shape; ++ if (failed(hlo::matchInts(shapeValue, shape))) ++ return rewriter.notifyMatchFailure(op, "expected constant output shape"); ++ return refineReturnShape(rewriter, op, shape); ++} ++ ++} // namespace stablehlo ++} // namespace mlir ++ ++#endif // STABLEHLO_TRANSFORMS_STABLEHLO_REFINE_SHAPES_H +diff --ruN a/stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp b/stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp +--- stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp ++++ stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp +@@ -430,9 +430,20 @@ + SmallVector& stablehloAttrs) { + auto tensorAttr = dyn_cast(vhloAttr); + if (!tensorAttr) return specialFailure(); +- ArrayRef<int64_t> data( +- reinterpret_cast<const int64_t*>(tensorAttr.getData().data()), +- tensorAttr.getData().size() / sizeof(int64_t)); ++ ++ auto data = ArrayRef<int64_t>( ++ reinterpret_cast<const int64_t*>(tensorAttr.getData().data()), ++ tensorAttr.getData().size() / sizeof(int64_t)) ++ .vec(); ++ ++ // Handle splats ++ if (data.size() == 1) { ++ auto tensorType = tensorAttr.getType().dyn_cast(); ++ if (!tensorType || (tensorType.getShape().size() != 1)) ++ return specialFailure(); ++ auto size = tensorType.getShape()[0]; ++ data.resize(size, data[0]); ++ } + + stablehloAttrs.emplace_back( + vhloName, DenseI64ArrayAttr::get(vhloAttr.getContext(), data)); diff --git a/third_party/stablehlo/workspace.bzl b/third_party/stablehlo/workspace.bzl index 8d7054dda8b2c0..f175093e925b74 100644 --- a/third_party/stablehlo/workspace.bzl +++ b/third_party/stablehlo/workspace.bzl @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): # LINT.IfChange - STABLEHLO_COMMIT = "04291aea6b50d9573e6f4de184938d83b9564cd0" - STABLEHLO_SHA256 = "2f57b2cb8eeadebe8430e294f88919b392cf472c62fdd40d4713680b283d64e5" + STABLEHLO_COMMIT = "ab709fe48de88c67717abfbd7ef17425eb95ddaf" + STABLEHLO_SHA256 = "a469ecc3d6747f9effdc1c7813568953dd1dc30070ca8f4f6f8a4d405e8c687e" # LINT.ThenChange(Google-internal path) tf_http_archive( diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl index 6dd0e178ec09b7..9fca8c020bf276 100644 --- a/third_party/tf_runtime/workspace.bzl +++ b/third_party/tf_runtime/workspace.bzl @@ -6,8 +6,8 @@ def repo(): """Imports TFRT.""" # Attention: tools parse and update these lines.
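For illustration only (not part of this patch): the helpers declared in `StablehloRefineShapes.h` above are meant to be called from the individual rewrite patterns registered by `populateStablehloRefineShapesPatterns`. A minimal sketch of such a pattern follows; it assumes `stablehlo::DynamicIotaOp` and its generated `getOutputShape()` accessor, so treat the names as illustrative rather than as the exact patterns used by the pass.

```
// Illustrative sketch only -- assumes stablehlo::DynamicIotaOp and its
// getOutputShape() accessor; the real pass registers many such patterns.
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"
#include "stablehlo/dialect/StablehloOps.h"
#include "stablehlo/transforms/StablehloRefineShapes.h"

namespace mlir {
namespace stablehlo {

struct RefineDynamicIotaOpPattern : public OpRewritePattern<DynamicIotaOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(DynamicIotaOp op,
                                PatternRewriter& rewriter) const override {
    // refineReturnShape matches the constant output_shape operand via
    // hlo::matchInts and forwards it to refineReturnTypes, which also tells
    // the rewriter to revisit the users of the refined op.
    return refineReturnShape(rewriter, op, op.getOutputShape());
  }
};

}  // namespace stablehlo
}  // namespace mlir
```

Registering such a pattern with `patterns->add<RefineDynamicIotaOpPattern>(context)` inside `populateStablehloRefineShapesPatterns` is how the pass would pick it up.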
- TFRT_COMMIT = "e45cd275068c87cbd1d42d0dc89475d72798a9e8" - TFRT_SHA256 = "dd4a1440fdc8bf142c5ac00bd6227e41999a0912b2f847e932b57307f97138dd" + TFRT_COMMIT = "dbd8da33ab49ed8aa5f08ebe85bacb91341f5d61" + TFRT_SHA256 = "b95b1d17eb2e28ee0f00ae672c7377767a17e7dadde169b335aa481bb07883c7" tf_http_archive( name = "tf_runtime", diff --git a/third_party/triton/cl577369732.patch b/third_party/triton/cl577369732.patch deleted file mode 100644 index e63b9f3804974b..00000000000000 --- a/third_party/triton/cl577369732.patch +++ /dev/null @@ -1,116 +0,0 @@ -==== triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp#19 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp 2023-10-19 14:55:11.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -759,7 +759,7 @@ - OpBuilder builder(forOp); - // Get init operands for loop carried values - for (BlockArgument &arg : forOp.getRegionIterArgs()) { -- OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); -+ OpOperand &operand = *forOp.getTiedLoopInit(arg); - setValueMapping(arg, operand.get(), 0); - } - -==== triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp#10 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp 2023-10-19 14:55:11.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -188,7 +188,7 @@ - auto getIncomingOp = [this](Value v) -> Value { - if (auto arg = v.dyn_cast()) - if (arg.getOwner()->getParentOp() == forOp.getOperation()) -- return forOp.getOpOperandForRegionIterArg(arg).get(); -+ return forOp.getTiedLoopInit(arg)->get(); - return Value(); - }; - -@@ -298,10 +298,10 @@ - Operation *firstDot = builder.clone(*dot, mapping); - if (Value a = operand2headPrefetch.lookup(dot.getA())) - firstDot->setOperand( -- 0, newForOp.getRegionIterArgForOpOperand(*a.use_begin())); -+ 0, newForOp.getTiedLoopRegionIterArg(&*a.use_begin())); - if (Value b = operand2headPrefetch.lookup(dot.getB())) - firstDot->setOperand( -- 1, newForOp.getRegionIterArgForOpOperand(*b.use_begin())); -+ 1, newForOp.getTiedLoopRegionIterArg(&*b.use_begin())); - - // remaining part - int64_t kOff = prefetchWidth; -==== triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp#18 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp 2023-10-24 18:31:01.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -245,7 +245,7 @@ - for (OpOperand &use : value.getUses()) { - Operation *user = use.getOwner(); - if (auto forOp = dyn_cast(user)) { -- Value arg = forOp.getRegionIterArgForOpOperand(use); -+ Value arg = forOp.getTiedLoopRegionIterArg(&use); - Value result = forOp.getResultForOpOperand(use); - setEncoding({arg, result}, info, changed, user); - continue; -@@ -767,7 +767,7 @@ - SmallVector newOperands; - for (auto arg : forOp.getRegionIterArgs()) { - if (slice.count(arg)) { -- OpOperand &initVal = 
forOp.getOpOperandForRegionIterArg(arg); -+ OpOperand &initVal = *forOp.getTiedLoopInit(arg); - argMapping.push_back(std::make_pair( - forOp.getResultForOpOperand(initVal).getResultNumber(), - forOp.getInitArgs().size() + newOperands.size())); -==== triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp#16 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp 2023-10-24 18:31:01.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -430,10 +430,10 @@ - Block *block = blockArg.getOwner(); - Operation *parentOp = block->getParentOp(); - if (auto forOp = dyn_cast(parentOp)) { -- OpOperand &initOperand = forOp.getOpOperandForRegionIterArg(blockArg); -+ OpOperand *initOperand = forOp.getTiedLoopInit(blockArg); - Value yieldOperand = forOp.getBody()->getTerminator()->getOperand( - blockArg.getArgNumber() - forOp.getNumInductionVars()); -- queue.push_back({initOperand.get(), encoding}); -+ queue.push_back({initOperand->get(), encoding}); - queue.push_back({yieldOperand, encoding}); - continue; - } -==== triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp#1 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp 2023-10-12 01:35:16.000000000 -0700 -+++ triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -88,9 +88,8 @@ - auto parentOp = blockArg.getOwner()->getParentOp(); - if (auto forOp = dyn_cast(parentOp)) { - if (blockArg.getArgNumber() >= forOp.getNumInductionVars()) { -- if (failed(getDependentPointers( -- forOp.getOpOperandForRegionIterArg(blockArg).get(), -- dependentSet, processedSet))) -+ if (failed(getDependentPointers(forOp.getTiedLoopInit(blockArg)->get(), -+ dependentSet, processedSet))) - return failure(); - - unsigned operandIdx = -@@ -383,7 +382,7 @@ - if (failed(addControlOperandsForForOp(forOp))) - return failure(); - if (blockArg.getArgNumber() >= forOp.getNumInductionVars()) { -- Value operand = forOp.getOpOperandForRegionIterArg(blockArg).get(); -+ Value operand = forOp.getTiedLoopInit(blockArg)->get(); - if (failed(tryInsertAndPropagate(operand))) - return failure(); - -==== triton/test/lib/Analysis/TestAlias.cpp#5 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/test/lib/Analysis/TestAlias.cpp ==== -# action=edit type=text ---- triton/test/lib/Analysis/TestAlias.cpp 2023-10-19 14:55:11.000000000 -0700 -+++ triton/test/lib/Analysis/TestAlias.cpp 2023-10-27 20:17:47.000000000 -0700 -@@ -87,7 +87,7 @@ - } - if (auto forOp = dyn_cast(op)) { - for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) { -- auto operand = forOp.getOpOperandForRegionIterArg(arg.value()).get(); -+ auto operand = forOp.getTiedLoopInit(arg.value())->get(); - auto opNames = getAllocOpNames(operand); - auto argName = getValueOperandName(arg.value(), state); - print(argName, opNames, os); diff --git a/third_party/triton/cl577379396.patch b/third_party/triton/cl577379396.patch deleted file mode 100644 index ee569f9b8f55c3..00000000000000 --- a/third_party/triton/cl577379396.patch +++ /dev/null @@ -1,33 +0,0 @@ -diff --git a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp 
b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp ---- a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp -+++ b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp -@@ -246,7 +246,7 @@ SmallVector LayoutPropagation::pr - Operation *user = use.getOwner(); - if (auto forOp = dyn_cast(user)) { - Value arg = forOp.getTiedLoopRegionIterArg(&use); -- Value result = forOp.getResultForOpOperand(use); -+ Value result = forOp.getTiedLoopResult(&use); - setEncoding({arg, result}, info, changed, user); - continue; - } -@@ -769,7 +769,7 @@ static void rewriteSlice(SetVector()) { - auto result = value.cast(); -- OpOperand &forOperand = nestedFor.getOpOperandForResult(result); -+ OpOperand &forOperand = *nestedFor.getTiedLoopInit(result); - markLive(forOperand.get()); - auto nestedYieldOp = - cast(nestedFor.getBody()->getTerminator()); diff --git a/third_party/triton/workspace.bzl b/third_party/triton/workspace.bzl index c0c6207f85da73..b864617b503f3e 100644 --- a/third_party/triton/workspace.bzl +++ b/third_party/triton/workspace.bzl @@ -5,8 +5,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): """Imports Triton.""" - TRITON_COMMIT = "cl578837341" - TRITON_SHA256 = "0d8112bb31d48b5beadbfc2e13c52770a95d3759b312b15cf26dd72e71410568" + TRITON_COMMIT = "cl588045313" + TRITON_SHA256 = "14cb6ddccc3139b2e8d77af08bb232eb06536d5c715c4bbc720a752af40ba2dc" tf_http_archive( name = "triton", @@ -15,7 +15,7 @@ def repo(): urls = tf_mirror_urls("https://github.com/openxla/triton/archive/{commit}.tar.gz".format(commit = TRITON_COMMIT)), # For temporary changes which haven't landed upstream yet. patch_file = [ - "//third_party/triton:cl568176943.patch", "//third_party/triton:b304456327.patch", + "//third_party/triton:cl568176943.patch", ], ) diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc index e9fc2d4eb20a55..9de6b6e0c2bd54 100644 --- a/third_party/xla/.bazelrc +++ b/third_party/xla/.bazelrc @@ -526,34 +526,9 @@ build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_c build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl" test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --config=cuda +build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1 -build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true -build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true -build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true -build:rbe_linux_cuda_nvcc --config=tensorrt -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80" -build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12" -build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8" -build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2" -build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" -build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc --config=rbe_linux -build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc 
--extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9" -build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3" -# These you may need to change for your own GCP project. -common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance -build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda" -build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt" -build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl" -test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" +build:rbe_linux_cuda_nvcc --action_env=TF_NVCC_CLANG="1" # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed build:rbe_win --config=rbe_base @@ -692,19 +667,39 @@ build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda build:release_cpu_macos --config=avx_linux test:release_cpu_macos --config=release_base -# Build configs for macOS ARM CPUs +# Base build configs for macOS +build:release_macos_base --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer +build:release_macos_base --define=no_nccl_support=true --output_filter=^$ + +# Build configs for macOS x86 +build:release_macos_x86 --config=release_macos_base +# Build with the AVX instruction set when on macOS x86 +build:release_macos_x86 --config=avx_linux +build:release_macos_x86 --cpu=darwin +# Target Catalina as the minimum compatible OS version +build:release_macos_x86 --macos_minimum_os=10.15 +build:release_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15 + +# Build configs for macOS Arm64 +build:release_macos_arm64 --config=release_macos_base build:release_macos_arm64 --cpu=darwin_arm64 -# Set DEVELOPER_DIR to select a version of Xcode. 
-build:release_macos_arm64 --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer -build:release_macos_arm64 --define=no_nccl_support=true -# Suppress all warning messages -build:release_macos_arm64 --output_filter=^$ -# Disable MKL build:release_macos_arm64 --define=tensorflow_mkldnn_contraction_kernel=0 # Target Moneterey as the minimum compatible OS version build:release_macos_arm64 --macos_minimum_os=12.0 build:release_macos_arm64 --action_env MACOSX_DEPLOYMENT_TARGET=12.0 +# Base test configs for macOS +test:release_macos_base --verbose_failures=true --local_test_jobs=HOST_CPUS +test:release_macos_base --test_timeout=300,450,1200,3600 --test_output=errors +test:release_macos_base --build_tests_only --keep_going +test:release_macos_base --flaky_test_attempts=3 + +# Test configs for macOS x86 +test:release_macos_x86 --config=release_macos_base + +# Test configs for macOS Arm64 +test:release_macos_arm64 --config=release_macos_base + # TODO(kanglan): Update windows configs after b/289091160 is fixed build:release_cpu_windows --config=avx_win build:release_cpu_windows --define=no_tensorflow_py_deps=true @@ -723,10 +718,14 @@ build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compil # Use --config=tf_public_cache to try and use the TensorFlow public build cache # to build TensorFlow. Look at ci/official/envs to find which types of jobs -# push to the cache. +# push to the cache. For macOS, use --config=tf_public_macos_cache build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false # Cache pushes are limited to TF's CI system. build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials +# Public cache for macOS builds +build:tf_public_macos_cache --remote_cache="https://storage.googleapis.com/tensorflow-macos-bazel-cache/oct2023" --remote_upload_local_results=false +# Cache pushes are limited to TF's CI system. +build:tf_public_macos_cache_push --config=tf_public_macos_cache --remote_upload_local_results=true --google_default_credentials # END TF CACHE HELPER OPTIONS # BEGIN TF TEST SUITE OPTIONS @@ -743,22 +742,27 @@ build:linux_libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow. test:linux_cpu_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cpu_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cpu_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... 
# CUDA WHEEL -test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 -test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 WHEEL test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test +test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... 
-//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test # MACOS ARM64 WHEEL test:macos_arm64_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 -test:macos_arm64_wheel_test_filters --test_lang_filters=py -test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:macos_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium +test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... +# MACOS X86 WHEEL +test:macos_x86_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +test:macos_x86_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +test:macos_x86_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium +test:macos_x86_wheel_test --config=macos_x86_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over # the whole TF code base. These are usually run continuously or upon presubmit. @@ -766,21 +770,53 @@ test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorf test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... 
-//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # CUDA PYCPP: test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 PYCPP test:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3 # TODO(michaelhudgins): Why do we need to specifically omit go and java here? -test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test +test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... 
-//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test -//tensorflow/python/tools:aot_compiled_test +# CROSS-COMPILE ARM64 PYCPP +test:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test +# Tests that fail only when cross-compiled +test:cross_compile_linux_arm64_pycpp_test -//tensorflow/compiler/mlir/quantization/stablehlo:convert_tf_quant_to_mhlo_int_test # MACOS ARM64 PYCPP test:macos_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 -test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py -test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium +test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... -//tensorflow/core/kernels/image:resize_bicubic_op_test # END TF TEST SUITE OPTIONS + +# START LINUX AARCH64 CROSS-COMPILE CONFIGS +# Set execution platform to Linux x86 +# Note: Lot of the "host_" flags such as "host_cpu" and "host_crosstool_top" +# flags seem to be actually used to specify the execution platform details. It +# seems it is this way because these flags are old and predate the distinction +# between host and execution platform. 
+build:cross_compile_linux_arm64 --host_cpu=k8 +build:cross_compile_linux_arm64 --host_crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite +build:cross_compile_linux_arm64 --extra_execution_platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_x86_64 + +# Set the target CPU to Aarch64 +build:cross_compile_linux_arm64 --platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_aarch64 +build:cross_compile_linux_arm64 --cpu=aarch64 +build:cross_compile_linux_arm64 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite + +# RBE configs +build:rbe_cross_compile_linux_arm64 --config=cross_compile_linux_arm64 +build:rbe_cross_compile_linux_arm64 --config=rbe_base +build:rbe_cross_compile_linux_arm64 --remote_instance_name=projects/tensorflow-testing/instances/default_instance + +# Test-related settings below this point +# We cannot run cross-compiled tests on the remote Linux x86 VMs so we need to +# force all tests to run locally on the Aarch64 host. +test:rbe_cross_compile_linux_arm64 --strategy=TestRunner=local +test:rbe_cross_compile_linux_arm64 --verbose_failures=true --local_test_jobs=HOST_CPUS --test_output=errors +test:rbe_cross_compile_linux_arm64 --flaky_test_attempts=3 --build_tests_only +# END LINUX AARCH64 CROSS-COMPILE CONFIGS diff --git a/third_party/xla/.github/workflows/trusted_partners.js b/third_party/xla/.github/workflows/trusted_partners.js index fcb1551059cc73..75a1ff082592b7 100644 --- a/third_party/xla/.github/workflows/trusted_partners.js +++ b/third_party/xla/.github/workflows/trusted_partners.js @@ -53,7 +53,7 @@ const get_email_domain = async ({github, username}) => { const filter_action = async ({github, context, domain}) => { const labels = ['kokoro:force-run']; - let assignees = ['radhakrishnaba', 'xla-rotation']; + let assignees = ['kamaljeeti', 'xla-rotation']; const title = context.payload.pull_request && context.payload.pull_request.title; const lowercased_title = (title || '').toLowerCase(); diff --git a/third_party/xla/.kokoro/jax/build.sh b/third_party/xla/.kokoro/jax/build.sh index 417b515a4b4898..4cfd6d12426b87 100644 --- a/third_party/xla/.kokoro/jax/build.sh +++ b/third_party/xla/.kokoro/jax/build.sh @@ -37,11 +37,12 @@ prelude() { if is_linux_gpu_job ; then export JAX_CUDA_VERSION=12 export JAX_CUDNN_VERSION=8.9 - nvidia-smi + setup_env_vars_py39 + else + setup_env_vars_py312 fi - setup_env_vars_py312 cd "${KOKORO_ARTIFACTS_DIR}" use_local_or_install_python @@ -50,52 +51,49 @@ prelude() { # Install bazel update_bazel_linux - chmod +x "${KOKORO_GFILE_DIR}/bazel_wrapper.py" cd jax } build_and_test_on_rbe_cpu() { # Run the tests. - "${KOKORO_GFILE_DIR}/bazel_wrapper.py" \ + bazel \ test \ --verbose_failures=true \ --override_repository=xla="${KOKORO_ARTIFACTS_DIR}"/github/xla \ --config=avx_posix \ - --config=tpu \ --config=mkl_open_source_only \ - --config="$NOCUDA_RBE_CONFIG_NAME" \ + --config="rbe_cpu_linux_py3.12" \ --config=tensorflow_testing_rbe_linux \ --test_env=JAX_NUM_GENERATED_CASES=25 \ - //tests:cpu_tests //tests:backend_independent_tests \ - --test_output=errors + --test_output=errors \ + -- //tests:cpu_tests //tests:backend_independent_tests } build_and_test_on_rbe_gpu() { # Runs non-multiaccelerator tests with one GPU apiece. # It appears --run_under needs an absolute path. 
- "${KOKORO_GFILE_DIR}/bazel_wrapper.py" \ + + bazel \ test \ --verbose_failures=true \ - //tests:gpu_tests //tests:backend_independent_tests \ --override_repository=xla="${KOKORO_ARTIFACTS_DIR}"/github/xla \ --config=avx_posix \ --config=mkl_open_source_only \ - --config="$CUDA_RBE_CONFIG_NAME" \ + --config="rbe_linux_cuda12.2_nvcc_py3.9" \ + --config=tensorflow_testing_rbe_linux \ --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \ --test_output=errors \ --test_env=JAX_SKIP_SLOW_TESTS=1 \ --test_env=TF_CPP_MIN_LOG_LEVEL=0 \ - --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow \ - --test_tag_filters=-multiaccelerator + --test_env=JAX_EXCLUDE_TEST_TARGETS="PmapTest.testSizeOverflow" \ + --test_tag_filters=-multiaccelerator \ + -- //tests:gpu_tests //tests:backend_independent_tests } # Generate a templated results file to make output accessible to everyone "$KOKORO_ARTIFACTS_DIR"/github/xla/.kokoro/generate_index_html.sh "$KOKORO_ARTIFACTS_DIR"/index.html -NOCUDA_RBE_CONFIG_NAME="rbe_cpu_linux_py312" -CUDA_RBE_CONFIG_NAME="rbe_linux_cuda12.2_nvcc_py3.12" - prelude if is_linux_gpu_job ; then diff --git a/third_party/xla/.kokoro/linux/build.sh b/third_party/xla/.kokoro/linux/build.sh index 49b10b04a899ca..635af61a6d3ed5 100644 --- a/third_party/xla/.kokoro/linux/build.sh +++ b/third_party/xla/.kokoro/linux/build.sh @@ -26,10 +26,6 @@ function is_linux_gpu_job() { [[ "$KOKORO_JOB_NAME" =~ tensorflow/xla/linux/.*gpu.* ]] } -function is_use_nvcc() { - [[ -z "${USE_NVCC:-}" ]] || [[ "$USE_NVCC" == "true" ]] -} - # Pull the container (in case it was updated since the instance started) and # store its SHA in the Sponge log. docker pull "$DOCKER_IMAGE" @@ -54,11 +50,7 @@ if is_linux_gpu_job ; then TAGS_FILTER="$TAGS_FILTER,gpu,requires-gpu-nvidia,-no_gpu" ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute" RC_FILE="/usertools/gpu.bazelrc" - if is_use_nvcc ; then - RBE_CONFIG="rbe_linux_cuda_nvcc" - else - RBE_CONFIG="rbe_linux_cuda" - fi + RBE_CONFIG="rbe_linux_cuda_nvcc" echo "***NOTE: nvidia-smi lists the highest CUDA version the driver supports, which may be different than the version of CUDA actually used!!***" nvidia-smi else diff --git a/third_party/xla/build_tools/lint/BUILD b/third_party/xla/build_tools/lint/BUILD index 8ca1872bb1b064..0270b76421a545 100644 --- a/third_party/xla/build_tools/lint/BUILD +++ b/third_party/xla/build_tools/lint/BUILD @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================ -load("//xla:pytype.default.bzl", "pytype_strict_library") +load("//xla:pytype.default.bzl", "pytype_strict_binary", "pytype_strict_library") # Placeholder: load py_test package( @@ -34,6 +34,11 @@ pytype_strict_library( visibility = ["//visibility:public"], ) +pytype_strict_binary( + name = "generate_compile_commands", + srcs = ["generate_compile_commands.py"], +) + py_test( name = "check_contents_test", srcs = ["check_contents_test.py"], diff --git a/third_party/xla/build_tools/lint/check_contents.py b/third_party/xla/build_tools/lint/check_contents.py index 1649152148d1a4..5d09ec074b3b1e 100644 --- a/third_party/xla/build_tools/lint/check_contents.py +++ b/third_party/xla/build_tools/lint/check_contents.py @@ -22,7 +22,7 @@ import logging # Intended to run on vanilla Github Actions runner import re import sys -from typing import Iterable, Optional, Sequence +from typing import Iterable, Sequence from xla.build_tools.lint import diff_parser @@ -92,7 +92,7 @@ def check_diffs( hunks: Iterable[diff_parser.Hunk], *, prohibited_regex: str, - suppression_regex: Optional[str] = None, # TODO(ddunleavy): CI not on 3.10 + suppression_regex: str | None = None, ) -> list[RegexLocation]: """Checks FileDiffs for prohibited regexes. diff --git a/third_party/xla/build_tools/lint/generate_compile_commands.py b/third_party/xla/build_tools/lint/generate_compile_commands.py new file mode 100644 index 00000000000000..735fc53f8aa8a6 --- /dev/null +++ b/third_party/xla/build_tools/lint/generate_compile_commands.py @@ -0,0 +1,129 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +r"""Produces a `compile_commands.json` from the output of `bazel aquery`. + +This tool requires that a build has been completed for all targets in the +query (e.g., for the example usage below `bazel build //xla/...`). This is due +to generated files like proto headers and files generated via tablegen. So if +LSP or other tools get out of date, it may be necessary to rebuild or regenerate +`compile_commands.json`, or both. + +Example usage: + bazel aquery "mnemonic(CppCompile, //xla/...)" --output=jsonproto | \ + python3 build_tools/lint/generate_compile_commands.py +""" +import dataclasses +import json +import logging +import pathlib +import sys +from typing import Any + +_JSONDict = dict[Any, Any] # Approximates parsed JSON + +_DISALLOWED_ARGS = frozenset(["-fno-canonical-system-headers"]) +_XLA_SRC_ROOT = pathlib.Path(__file__).absolute().parent.parent.parent + + +@dataclasses.dataclass +class CompileCommand: + """Represents a compilation command with options on a specific file.""" + + file: str + arguments: list[str] + + @classmethod + def from_args_list(cls, args_list: list[str]) -> "CompileCommand": + """Alternative constructor which uses the args_list from `bazel aquery`. 
+ + This collects arguments and the file being run on from the output of + `bazel aquery`. Also filters out arguments which break clang-tidy. + + Arguments: + args_list: List of arguments generated by `bazel aquery` + + Returns: + The corresponding ClangTidyCommand. + """ + cc_file = None + filtered_args = [] + + for arg in args_list: + if arg in _DISALLOWED_ARGS: + continue + + if arg.endswith(".cc"): + cc_file = arg + + filtered_args.append(arg) + + return cls(cc_file, filtered_args) + + def to_dumpable_json(self, directory: str) -> _JSONDict: + return { + "directory": directory, + "file": self.file, + "arguments": self.arguments, + } + + +def extract_compile_commands( + parsed_aquery_output: _JSONDict, +) -> list[CompileCommand]: + """Gathers compile commands to run from `bazel aquery` JSON output. + + Arguments: + parsed_aquery_output: Parsed JSON representing the output of `bazel aquery + --output=jsonproto`. + + Returns: + The list of CompileCommands that should be executed. + """ + actions = parsed_aquery_output["actions"] + + commands = [] + for action in actions: + command = CompileCommand.from_args_list(action["arguments"]) + commands.append(command) + return commands + + +def main(): + # Setup logging + logging.basicConfig() + logging.getLogger().setLevel(logging.INFO) + + # Setup external symlink if necessary so headers can be found in include paths + if not (external := _XLA_SRC_ROOT / "external").exists(): + logging.info("Symlinking `xla/bazel-xla/external` to `xla/external`") + external.symlink_to(_XLA_SRC_ROOT / "bazel-xla" / "external") + + logging.info("Reading `bazel aquery` output from stdin...") + parsed_aquery_output = json.loads(sys.stdin.read()) + + commands = extract_compile_commands(parsed_aquery_output) + + with (_XLA_SRC_ROOT / "compile_commands.json").open("w") as f: + json.dump( + [ + command.to_dumpable_json(directory=str(_XLA_SRC_ROOT)) + for command in commands + ], + f, + ) + + +if __name__ == "__main__": + main() diff --git a/third_party/xla/docs/_book.yaml b/third_party/xla/docs/_book.yaml new file mode 100644 index 00000000000000..a6030d45a9949f --- /dev/null +++ b/third_party/xla/docs/_book.yaml @@ -0,0 +1,47 @@ +upper_tabs: +# Tabs left of dropdown menu +- include: /_upper_tabs_left.yaml +- include: /api_docs/_upper_tabs_api.yaml +# Dropdown menu +- name: Resources + path: /resources + is_default: true + menu: + - include: /resources/_menu_toc.yaml + lower_tabs: + # Subsite tabs + other: + - name: Overview + contents: + - heading: OpenXLA + - title: Overview + path: /xla + - title: XLA architecture + path: /xla/architecture + - title: Broadcasting semantics + path: /xla/broadcasting + - title: Develop a new backend for XLA + path: /xla/developing_new_backend + - title: Code Reviews Guide + path: /xla/code_reviews + - title: Operation semantics + path: /xla/operation_semantics + - title: Shapes and layout + path: /xla/shapes + - title: Aliasing + path: /xla/aliasing + - title: Tiled layout + path: /xla/tiled_layout + - title: Writing custom calls + path: /xla/custom_call + - heading: TensorFlow - XLA + - title: Known issues + path: /xla/known_issues + - title: Use AOT compilation + path: /xla/tfcompile + - title: XLA autoclustering + path: /xla/tutorials/autoclustering_xla + - title: Use XLA with tf.function + path: /xla/tutorials/jit_compile + +- include: /_upper_tabs_right.yaml diff --git a/third_party/xla/docs/async_ops.md b/third_party/xla/docs/async_ops.md new file mode 100644 index 00000000000000..889272eecc4411 --- /dev/null +++ 
b/third_party/xla/docs/async_ops.md @@ -0,0 +1,121 @@ +# Async HLO Instructions + +1. Adding async operations to HLO is cumbersome (i.e. `all-reduce-start` and + `all-reduce-done`). +2. The start and done split may be inadequate for some of the asynchronous use + cases. + +To target the first shortcoming, we propose to introduce one last set of new +asynchronous opcodes: `kAsyncStart`, `kAsyncUpdate`, and `kAsyncDone`. The idea +is to create a generic asynchronous opcode that can wrap any HLO instruction. +The actual operation that will be performed asynchronously will be encoded using +a called computation that only has the instruction as its root and any +parameters for inputs. The in-flight input/output buffer handling and aliasing +can then be shared for any asynchronous operation. The async-start instruction’s +output shape will then be a tuple of the input operands, output values, and any +intermediate state that is needed for the `async-update` or `async-done` +instructions. + +``` +%async_op { + %param0 = f32[64] parameter(0) + ROOT %op = f32[32] op(f32[64] %param0), op_specific_attr=”foo” +} + +%async-start = (f32[64], f32[32], s32[]) async-start(f32[64] %operand), + calls=%async_op +%async-done = f32[32] async-done((f32[64], f32[32], s32[]) %async-start), + calls=%async_op +``` + +In the representation above, only `async-start` has a called computation since +it is trivial to find what the `async-done` does by following its operand to +find the corresponding `async-start` to find the called computation. + +Today both `async-start` and `async-done` have a called computation attribute, +but long term we plan to keep it only for `async-start`, since it is trivial +to find what the `async-done` does by following its operand to find the +corresponding `async-start` to find the called computation. + +> [!NOTE] +> Tracked as b/302594825 internally. + +Also note +that the first element in the output tuple of `async-start` aliases with the +operand, so the buffer stays alive until at least the async-done instruction. +Similarly, the second element aliases with the output of `async-done`, and the +third element is the context state that is used to keep track of the +asynchronous operation. This representation also supports multiple tensors in +the asynchronous operation input and/or output and the aliasing works the same +way: + +``` +%async_op { + %param0 = f32[64] parameter(0) + %param1 = f32[64] parameter(1) + ROOT %op = (f32[32], f32[32]) op(f32[64] %param0, f32[64] %param1), + op_specific_attr=”foo” +} + +%async-start = ((f32[64], f32[64]), (f32[32], f32[32]), s32[]) + async-start(f32[64] %operand0, f32[64] %operand1), + calls=%async_op +%async-done = (f32[32], f32[32]) async-done(%async-start) +``` + +In addition, the op can further be decomposed into zero or more `async-update` +steps that perform intermediate computations. 
The input/output aliasing works +the same way with the `async-update` instruction and each `async-start` and +`async-update` instructions must have one user that is either another +`async-update` or an `async-done`: + +``` +%async_op { + %param0 = f32[64] parameter(0) + ROOT %op = f32[32] op(f32[64] %param0), op_specific_attr=”foo” +} + +%async-start = (f32[64], f32[32], s32[]) async-start(f32[64] %operand), + calls=%async_op +%async-update0 = (f32[64], f32[32], s32[]) async-update( + (f32[64], f32[32], s32[]) %async-start) +%async-update1 = (f32[64], f32[32], s32[]) async-update( + (f32[64], f32[32], s32[]) %async-update0) +%async-done = f32[32] async-done((f32[64], f32[32], s32[]) %async-update1) + +``` + +## Syntax sugar + +Since having a separate computation to define the operation that will be +performed asynchronously is a bit cumbersome, we also propose a syntax sugar to +automatically print and parse asynchronous operations as if they are first-class +opcodes. The idea is to treat the “-start”, “-update”, and “-done” suffixes +specially by automatically creating the computation and instruction (without the +suffix) when parsing. For example, the code snippet above can be pretty-printed +to the following and the two can be parsed to the same representation: + +``` +%op-start = (f32[64], f32[32], s32[]) op-start(f32[64] %operand), + op_specific_attr=”foo” +%op-update0 = (f32[64], f32[32], s32[]) op-update( + (f32[64], f32[32], s32[]) %op-start), + op_specific_attr=”foo” +%op-update1 = (f32[64], f32[32], s32[]) op-update( + (f32[64], f32[32], s32[]) %op-update0), + op_specific_attr=”foo” +%op-done = f32[32] op-done((f32[64], f32[32], s32[]) %op-update1), + op_specific_attr=”foo” + +``` + +In order not to create ambiguities, the verifier will not allow an operation to +be wrapped with async-start if we explicitly defined an opcode for that +operation with the “-start” and/or “-done” suffixes. This is also an escape +hatch in case we have any instructions that require HLO-level treatment that +doesn’t fit in the model described above (e.g. the aliasing input/output +buffers). So, initially, `copy-start`/`copy-done`, +`collective-permute-start`/`collective-permute-done` etc. will continue to use +their respective first-class opcodes instead of the new +`async-start`/`async-done` opcodes until we clean up the code to remove these +“-start”/”-done” opcodes. 
diff --git a/third_party/xla/docs/build_from_source.md b/third_party/xla/docs/build_from_source.md index 9c4cc0e401fd37..f5b2ded3c4cd4e 100644 --- a/third_party/xla/docs/build_from_source.md +++ b/third_party/xla/docs/build_from_source.md @@ -33,7 +33,7 @@ We recommend using a suitable docker container to build/test XLA, such as [TensorFlow's docker container](https://www.tensorflow.org/install/docker): ``` -docker run --name xla -w /xla -it -d --rm -v $PWD:/xla tensorflow/build:latest-python3.9 bash +docker run --name xla -w /xla -it -d --rm -v $PWD:/xla tensorflow/tensorflow:latest-gpu bash ``` Using a docker container you can build XLA with CPU support using the following commands: diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files index 9abb2546fa24ed..9de7578a5801a9 100644 --- a/third_party/xla/opensource_only.files +++ b/third_party/xla/opensource_only.files @@ -26,6 +26,8 @@ tools/toolchains/BUILD: tools/toolchains/clang6/BUILD: tools/toolchains/cpus/py/BUILD: tools/toolchains/cpus/py3/BUILD: +tools/toolchains/cross_compile/cc/BUILD: +tools/toolchains/cross_compile/config/BUILD: tools/toolchains/embedded/arm-linux/BUILD: tools/toolchains/java/BUILD: tools/toolchains/python/BUILD: diff --git a/third_party/xla/third_party/cutlass.BUILD b/third_party/xla/third_party/cutlass.BUILD new file mode 100644 index 00000000000000..923d2f044c395a --- /dev/null +++ b/third_party/xla/third_party/cutlass.BUILD @@ -0,0 +1,24 @@ +# Description: +# CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance +# matrix-matrix multiplication (GEMM) and related computations at all levels and scales within CUDA. + +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # MIT + +exports_files(["LICENSE.txt"]) + +filegroup( + name = "cutlass_header_files", + srcs = glob([ + "include/**", + ]), +) + +cc_library( + name = "cutlass", + hdrs = [":cutlass_header_files"], + strip_include_prefix = "/include", +) diff --git a/third_party/xla/third_party/gloo/BUILD b/third_party/xla/third_party/gloo/BUILD new file mode 100644 index 00000000000000..3c413807167aeb --- /dev/null +++ b/third_party/xla/third_party/gloo/BUILD @@ -0,0 +1 @@ +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) diff --git a/third_party/xla/third_party/gloo/gloo.BUILD b/third_party/xla/third_party/gloo/gloo.BUILD new file mode 100644 index 00000000000000..e960fc518a7699 --- /dev/null +++ b/third_party/xla/third_party/gloo/gloo.BUILD @@ -0,0 +1,97 @@ +# Description: +# Gloo is a collective communications library + +load("@bazel_skylib//rules:expand_template.bzl", "expand_template") + +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) + +exports_files(["LICENSE"]) + +substitions = { + "@GLOO_VERSION_MAJOR@": "9999", + "@GLOO_VERSION_MINOR@": "0", + "@GLOO_VERSION_PATCH@": "0", + "#cmakedefine01 GLOO_USE_CUDA": "#define GLOO_USE_CUDA 0", + "#cmakedefine01 GLOO_USE_NCCL": "#define GLOO_USE_NCCL 0", + "#cmakedefine01 GLOO_USE_ROCM": "#define GLOO_USE_ROCM 0", + "#cmakedefine01 GLOO_USE_RCCL": "#define GLOO_USE_RCCL 0", + "#cmakedefine01 GLOO_USE_REDIS": "#define GLOO_USE_REDIS 0", + "#cmakedefine01 GLOO_USE_IBVERBS": "#define GLOO_USE_IBVERBS 0", + "#cmakedefine01 GLOO_USE_MPI": "#define GLOO_USE_MPI 0", + "#cmakedefine01 GLOO_USE_LIBUV": "#define GLOO_USE_LIBUV 0", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_TCP": "#define GLOO_HAVE_TRANSPORT_TCP 1", + "#cmakedefine01 
GLOO_HAVE_TRANSPORT_TCP_TLS": "#define GLOO_HAVE_TRANSPORT_TCP_TLS 0", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_IBVERBS": "#define GLOO_HAVE_TRANSPORT_IBVERBS 0", + "#cmakedefine01 GLOO_HAVE_TRANSPORT_UV": "#define GLOO_HAVE_TRANSPORT_UV 0", + "#cmakedefine01 GLOO_USE_AVX": "#define GLOO_USE_AVX __AVX__", +} + +expand_template( + name = "config", + out = "gloo/config.h", + substitutions = substitions, + template = "gloo/config.h.in", +) + +cc_library( + name = "gloo", + srcs = glob( + [ + "gloo/*.cc", + "gloo/common/*.cc", + "gloo/transport/*.cc", + ], + exclude = [ + "gloo/common/linux.cc", + "gloo/common/win.cc", + "gloo/cuda*.cc", + ], + ) + [ + "gloo/rendezvous/context.cc", + "gloo/rendezvous/file_store.cc", + "gloo/rendezvous/hash_store.cc", + "gloo/rendezvous/prefix_store.cc", + "gloo/rendezvous/store.cc", + ] + select({ + "@local_tsl//tsl:macos": [], + "@local_tsl//tsl:windows": [], + "//conditions:default": [ + "gloo/common/linux.cc", + ], + }), + copts = [ + "-fexceptions", + "-Wno-unused-variable", + ], + includes = ["."], + textual_hdrs = glob( + [ + "gloo/*.h", + "gloo/common/*.h", + "gloo/transport/*.h", + ], + exclude = [ + "gloo/cuda*.h", + "gloo/common/win.h", + ], + ) + [ + "gloo/config.h", + "gloo/rendezvous/context.h", + "gloo/rendezvous/file_store.h", + "gloo/rendezvous/hash_store.h", + "gloo/rendezvous/prefix_store.h", + "gloo/rendezvous/store.h", + ], +) + +cc_library( + name = "transport_tcp", + srcs = glob(["gloo/transport/tcp/*.cc"]), + hdrs = glob(["gloo/transport/tcp/*.h"]), + copts = ["-fexceptions"], + deps = [":gloo"], +) diff --git a/third_party/xla/third_party/gloo/workspace.bzl b/third_party/xla/third_party/gloo/workspace.bzl new file mode 100644 index 00000000000000..ede168395acdc5 --- /dev/null +++ b/third_party/xla/third_party/gloo/workspace.bzl @@ -0,0 +1,17 @@ +"""Provides the repository macro to import Gloo.""" + +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") + +def repo(): + """Imports Gloo.""" + + GLOO_COMMIT = "5354032ea08eadd7fc4456477f7f7c6308818509" + GLOO_SHA256 = "5759a06e6c8863c58e8ceadeb56f7c701fec89b2559ba33a103a447207bf69c7" + + tf_http_archive( + name = "gloo", + sha256 = GLOO_SHA256, + strip_prefix = "gloo-{commit}".format(commit = GLOO_COMMIT), + urls = tf_mirror_urls("https://github.com/facebookincubator/gloo/archive/{commit}.tar.gz".format(commit = GLOO_COMMIT)), + build_file = "//third_party/gloo:gloo.BUILD", + ) diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch old mode 100644 new mode 100755 index be1c1f0838e9d7..a476720fd2dbd6 --- a/third_party/xla/third_party/stablehlo/temporary.patch +++ b/third_party/xla/third_party/stablehlo/temporary.patch @@ -1,39 +1,14 @@ diff --ruN a/stablehlo/BUILD.bazel b/stablehlo/BUILD.bazel --- stablehlo/BUILD.bazel +++ stablehlo/BUILD.bazel -@@ -279,6 +279,24 @@ - ) - - cc_library( -+ name = "experimental_ops", -+ srcs = [ -+ "stablehlo/dialect/ExperimentalOps.cpp", -+ ], -+ hdrs = [ -+ "stablehlo/dialect/ExperimentalOps.h", -+ ], -+ strip_include_prefix = ".", -+ deps = [ -+ ":stablehlo_ops", -+ "@llvm-project//llvm:Support", -+ "@llvm-project//mlir:FuncDialect", -+ "@llvm-project//mlir:IR", -+ "@llvm-project//mlir:Support", -+ ], -+) -+ -+cc_library( - name = "interpreter_ops", - srcs = [ - "stablehlo/reference/InterpreterOps.cpp", -@@ -780,6 +798,7 @@ +@@ -890,6 +890,7 @@ + hdrs = [ + "stablehlo/transforms/MapStablehloToVhlo.h", + "stablehlo/transforms/Passes.h", ++ 
"stablehlo/transforms/StablehloRefineShapes.h", + ], + strip_include_prefix = ".", deps = [ - ":base", - ":chlo_ops", -+ ":experimental_ops", - ":stablehlo_ops", - ":stablehlo_ops_inc_gen", - ":stablehlo_pass_inc_gen", diff --ruN a/stablehlo/CMakeLists.txt b/stablehlo/CMakeLists.txt --- stablehlo/CMakeLists.txt +++ stablehlo/CMakeLists.txt @@ -181,32 +156,198 @@ diff --ruN a/stablehlo/CMakeLists.txt b/stablehlo/CMakeLists.txt #------------------------------------------------------------------------------- # Directory setup -diff --ruN a/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir b/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir ---- stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir -+++ stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir -@@ -19,6 +19,7 @@ - func.func @iota_dimension_0() -> tensor<4x8xf32> { - // CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() - // CHECK-SAME{LITERAL}: <{value = dense<[[0.000000e+00], [1.000000e+00], [2.000000e+00], [3.000000e+00]]> : tensor<4x1xf32>}> -+ // CHECK-DAG: %[[VAR1:.*]] = tosa.tile %[[VAR0]] {multiples = array} - %0 = "stablehlo.iota"() {iota_dimension = 0 : i64} : () -> (tensor<4x8xf32>) - return %0 : tensor<4x8xf32> - } -@@ -27,6 +28,7 @@ - func.func @iota_dimension_1() -> tensor<4x8xi32> { - // CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() - // CHECK-SAME{LITERAL}: <{value = dense<[[0, 1, 2, 3, 4, 5, 6, 7]]> : tensor<1x8xi32>}> -+ // CHECK-DAG: %[[VAR1:.*]] = tosa.tile %[[VAR0]] {multiples = array} - %0 = "stablehlo.iota"() {iota_dimension = 1 : i64} : () -> (tensor<4x8xi32>) - return %0 : tensor<4x8xi32> - } -diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/Base.cpp ---- stablehlo/stablehlo/dialect/Base.cpp -+++ stablehlo/stablehlo/dialect/Base.cpp -@@ -600,5 +600,18 @@ - return UnrankedTensorType::get(components.getElementType()); - } +diff --ruN a/stablehlo/stablehlo/CMakeLists.txt b/stablehlo/stablehlo/CMakeLists.txt +--- stablehlo/stablehlo/CMakeLists.txt ++++ stablehlo/stablehlo/CMakeLists.txt +@@ -15,6 +15,7 @@ + add_subdirectory(api) + add_subdirectory(conversions) + add_subdirectory(dialect) ++add_subdirectory(experimental) + add_subdirectory(integrations) + add_subdirectory(reference) + add_subdirectory(tests) +diff --ruN a/stablehlo/stablehlo/api/PortableApi.h b/stablehlo/stablehlo/api/PortableApi.h +--- stablehlo/stablehlo/api/PortableApi.h ++++ stablehlo/stablehlo/api/PortableApi.h +@@ -27,7 +27,8 @@ + /// Return the current version for portable API. + /// Increments on all meaningful changes to this file. +-inline int64_t getApiVersion() { return 4; } ++/// Or on large breaking source changes that are difficult to integrate. ++inline int64_t getApiVersion() { return 5; } + + // Get the current StableHLO version. + // +diff --ruN a/stablehlo/stablehlo/experimental/BUILD.bazel b/stablehlo/stablehlo/experimental/BUILD.bazel +--- stablehlo/stablehlo/experimental/BUILD.bazel ++++ stablehlo/stablehlo/experimental/BUILD.bazel +@@ -0,0 +1,114 @@ ++# Copyright 2023 The StableHLO Authors. All Rights Reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++# See the License for the specific language governing permissions and ++# limitations under the License. ++load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") ++ ++package( ++ default_visibility = ["//visibility:public"], ++ licenses = ["notice"], ++) ++ ++cc_library( ++ name = "experimental_base", ++ srcs = [ ++ "dialect/Base.cpp", ++ ], ++ hdrs = [ ++ "dialect/Base.h", ++ ], ++ deps = [ ++ "@llvm-project//llvm:Support", ++ "@llvm-project//mlir:IR", ++ ], ++) ++ ++cc_library( ++ name = "experimental_stablehlo_ops", ++ srcs = [ ++ "dialect/StablehloOps.cpp", ++ ], ++ hdrs = [ ++ "dialect/StablehloOps.h", ++ ], ++ deps = [ ++ ":experimental_base", ++ "//:stablehlo_ops", ++ "@llvm-project//llvm:Support", ++ "@llvm-project//mlir:FuncDialect", ++ "@llvm-project//mlir:IR", ++ "@llvm-project//mlir:Support", ++ ], ++) ++ ++gentbl_cc_library( ++ name = "experimental_stablehlo_pass_inc_gen", ++ tbl_outs = [ ++ ( ++ [ ++ "-gen-pass-decls", ++ ], ++ "transforms/Passes.h.inc", ++ ), ++ ], ++ tblgen = "@llvm-project//mlir:mlir-tblgen", ++ td_file = "transforms/Passes.td", ++ deps = ["@llvm-project//mlir:PassBaseTdFiles"], ++) ++ ++cc_library( ++ name = "experimental_stablehlo_passes", ++ srcs = [ ++ "transforms/StablehloCanonicalizeDynamism.cpp", ++ "transforms/StablehloRefineShapes.cpp", ++ ], ++ hdrs = [ ++ "transforms/Passes.h", ++ ], ++ deps = [ ++ ":experimental_stablehlo_ops", ++ ":experimental_stablehlo_pass_inc_gen", ++ "//:base", ++ "//:chlo_ops", ++ "//:stablehlo_ops", ++ "//:stablehlo_ops_inc_gen", ++ "//:stablehlo_passes", ++ "//:stablehlo_type_inference", ++ "@llvm-project//llvm:Support", ++ "@llvm-project//mlir:FuncDialect", ++ "@llvm-project//mlir:IR", ++ "@llvm-project//mlir:InferTypeOpInterface", ++ "@llvm-project//mlir:Pass", ++ "@llvm-project//mlir:Support", ++ "@llvm-project//mlir:TransformUtils", ++ "@llvm-project//mlir:Transforms", ++ ], ++) ++ ++cc_binary( ++ name = "experimental-stablehlo-opt", ++ srcs = [ ++ "tools/StablehloOptMain.cpp", ++ ], ++ deps = [ ++ ":experimental_stablehlo_passes", ++ "//:interpreter_ops", ++ "//:register", ++ "//:stablehlo_passes", ++ "//:test_utils", ++ "//:tosa_passes", ++ "@llvm-project//mlir:AllExtensions", ++ "@llvm-project//mlir:AllPassesAndDialects", ++ "@llvm-project//mlir:MlirOptLib", ++ "@llvm-project//mlir:TosaDialect", ++ ], ++) +diff --ruN a/stablehlo/stablehlo/experimental/CMakeLists.txt b/stablehlo/stablehlo/experimental/CMakeLists.txt +--- stablehlo/stablehlo/experimental/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/CMakeLists.txt +@@ -0,0 +1,18 @@ ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ ++add_subdirectory(dialect) ++add_subdirectory(tests) ++add_subdirectory(tools) ++add_subdirectory(transforms) +diff --ruN a/stablehlo/stablehlo/experimental/dialect/Base.cpp b/stablehlo/stablehlo/experimental/dialect/Base.cpp +--- stablehlo/stablehlo/experimental/dialect/Base.cpp ++++ stablehlo/stablehlo/experimental/dialect/Base.cpp +@@ -0,0 +1,39 @@ ++/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++ Copyright 2022 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include "stablehlo/experimental/dialect/Base.h" ++ ++#include "mlir/IR/BuiltinAttributes.h" ++#include "mlir/IR/BuiltinTypes.h" ++ ++namespace mlir { ++namespace hlo { ++ +DenseIntElementsAttr getPaddingAttr(MLIRContext* context, + ArrayRef values) { + return DenseIntElementsAttr::get( @@ -220,50 +361,97 @@ diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/ + return getPaddingAttr(builder->getContext(), values); +} + - } // namespace hlo - } // namespace mlir -diff --ruN a/stablehlo/stablehlo/dialect/Base.h b/stablehlo/stablehlo/dialect/Base.h ---- stablehlo/stablehlo/dialect/Base.h -+++ stablehlo/stablehlo/dialect/Base.h -@@ -194,6 +194,10 @@ - - ShapedType createShapedType(ShapedTypeComponents components); - ++} // namespace hlo ++} // namespace mlir +diff --ruN a/stablehlo/stablehlo/experimental/dialect/Base.h b/stablehlo/stablehlo/experimental/dialect/Base.h +--- stablehlo/stablehlo/experimental/dialect/Base.h ++++ stablehlo/stablehlo/experimental/dialect/Base.h +@@ -0,0 +1,35 @@ ++/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. ++ Copyright 2022 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef STABLEHLO_EXPERIMENTAL_DIALECT_BASE_H ++#define STABLEHLO_EXPERIMENTAL_DIALECT_BASE_H ++ ++#include "llvm/ADT/ArrayRef.h" ++#include "mlir/IR/Builders.h" ++#include "mlir/IR/BuiltinAttributes.h" ++#include "mlir/IR/MLIRContext.h" ++ ++namespace mlir { ++namespace hlo { ++ +DenseIntElementsAttr getPaddingAttr(MLIRContext *context, + ArrayRef value); +DenseIntElementsAttr getPaddingAttr(Builder *builder, ArrayRef value); + - // This interface is implemented by both StableHLO and MHLO dialects - // and is used as the foundation for sharing verification, type inference and - // prettyprinting logic between them. 
-diff --ruN a/stablehlo/stablehlo/dialect/CMakeLists.txt b/stablehlo/stablehlo/dialect/CMakeLists.txt ---- stablehlo/stablehlo/dialect/CMakeLists.txt -+++ stablehlo/stablehlo/dialect/CMakeLists.txt -@@ -77,6 +77,20 @@ - target_include_directories(ChloOps INTERFACE - $ - $ ++} // namespace hlo ++} // namespace mlir ++ ++#endif // STABLEHLO_EXPERIMENTAL_DIALECT_BASE_H +diff --ruN a/stablehlo/stablehlo/experimental/dialect/CMakeLists.txt b/stablehlo/stablehlo/experimental/dialect/CMakeLists.txt +--- stablehlo/stablehlo/experimental/dialect/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/dialect/CMakeLists.txt +@@ -0,0 +1,42 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++add_mlir_library(ExperimentalStablehloBase ++ PARTIAL_SOURCES_INTENDED ++ Base.cpp ++ ++ LINK_LIBS PUBLIC ++ MLIRIR +) + -+add_mlir_dialect_library(ExperimentalOps ++add_mlir_dialect_library(ExperimentalStablehloOps + PARTIAL_SOURCES_INTENDED -+ ExperimentalOps.cpp ++ StablehloOps.cpp + + DEPENDS + StablehloOpsIncGen + + LINK_LIBS PUBLIC ++ ExperimentalStablehloBase + MLIRFuncDialect + MLIRIR + MLIRSupport + StablehloOps - ) - - add_mlir_dialect_library(StablehloRegister -diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stablehlo/dialect/ExperimentalOps.cpp ---- stablehlo/stablehlo/dialect/ExperimentalOps.cpp -+++ stablehlo/stablehlo/dialect/ExperimentalOps.cpp -@@ -0,0 +1,504 @@ ++) ++ ++target_include_directories(ExperimentalStablehloOps INTERFACE ++ $ ++ $ ++) +diff --ruN a/stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp b/stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp +--- stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp ++++ stablehlo/stablehlo/experimental/dialect/StablehloOps.cpp +@@ -0,0 +1,615 @@ +/* Copyright 2023 The StableHLO Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); @@ -279,8 +467,9 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh +limitations under the License. +==============================================================================*/ + -+#include "stablehlo/dialect/ExperimentalOps.h" ++#include "stablehlo/experimental/dialect/StablehloOps.h" + ++#include +#include + +#include "llvm/ADT/ArrayRef.h" @@ -293,6 +482,7 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + +namespace mlir { +namespace stablehlo { ++namespace experimental { + +LogicalResult DynamicReduceWindowOpAdaptor::verify() { + // Before checking the constraints inherited from ReduceWindowOp, @@ -306,8 +496,7 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + // api_version and backend_config have default values. + // call_target_name should be "stablehlo.dynamic_reduce_window". + // called_computations carries the body. 
-+ if (attr.getName() != "api_version" && -+ attr.getName() != "backend_config" && ++ if (attr.getName() != "api_version" && attr.getName() != "backend_config" && + attr.getName() != "call_target_name" && + attr.getName() != "called_computations") + return op_.emitError() @@ -688,8 +877,8 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + + // dynamic_top_k_i2 + auto kType = k.getType().dyn_cast(); -+ if (!kType || !kType.hasRank() || -+ kType.getRank() != 0 || !kType.getElementType().isIntOrIndex()) ++ if (!kType || !kType.hasRank() || kType.getRank() != 0 || ++ !kType.getElementType().isIntOrIndex()) + return op_.emitError() + << "expects k (operand #1) " + << "to be a 0-dimensional tensor of integer or index type"; @@ -751,7 +940,6 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + return op_.getInputs()[1].cast>(); +} + -+ +TypedValue DynamicTopKOpAdaptor::getValues() { + return op_.getResults()[0].cast>(); +} @@ -760,18 +948,129 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.cpp b/stablehlo/stableh + return op_.getResults()[1].cast>(); +} + -+std::optional getDynamicTopKOp( -+ CustomCallOp op) { ++std::optional getDynamicTopKOp(CustomCallOp op) { + if (op.getCallTargetName() != "stablehlo.dynamic_top_k") return {}; + return DynamicTopKOpAdaptor(op); +} + ++LogicalResult TopKOpAdaptor::verify() { ++ if (op_->getNumOperands() != 1) ++ return op_.emitError("expects size(operands) = 1"); ++ if (op_->getNumResults() != 2) ++ return op_.emitError("expects size(results) = 2"); ++ if (!op_.getBackendConfig().empty()) ++ return op_.emitError() << "expects an empty backend_config"; ++ if (op_.getCallTargetName() != "mhlo.topk") ++ return op_.emitError() << "expects @mhlo.topk"; ++ ++ auto operand = op_.getInputs()[0]; ++ auto values = op_.getResults()[0]; ++ auto indices = op_.getResults()[1]; ++ DictionaryAttr topkAttributes = ++ op_->getAttrOfType("mhlo.attributes"); ++ if (!topkAttributes) { ++ return op_.emitError() ++ << "mhlo.attributes missing or not a dictionary attribute"; ++ } ++ ++ IntegerAttr k_attr = topkAttributes.get("k").dyn_cast_or_null(); ++ if (!k_attr) { ++ return op_.emitError() << "mhlo.attributes.k not present or not an integer"; ++ } ++ int64_t k = k_attr.getInt(); ++ ++ // mhlo.topk_c5 ++ if (k < 0) return op_.emitError() << "expects k >= 0"; ++ ++ // mhlo.topk_i1 ++ auto operandType = operand.getType().dyn_cast(); ++ if (!operandType || !operandType.hasRank() || operandType.getRank() < 1 || ++ !operandType.getElementType().isIntOrFloat()) ++ return op_.emitError() ++ << "expects operand #0 " ++ << "to be a tensor of integer or floating-point type " ++ << "of rank at least 1"; ++ ++ // mhlo.topk_o1 ++ auto valuesType = values.getType().dyn_cast(); ++ if (!valuesType || !valuesType.hasRank() || valuesType.getRank() < 1 || ++ !valuesType.getElementType().isIntOrFloat()) ++ return op_.emitError() ++ << "expects values (result #0) " ++ << "to be a tensor of integer or floating-point type " ++ << "of rank at least 1"; ++ ++ // mhlo.topk_o2 ++ auto indicesType = indices.getType().dyn_cast(); ++ if (!indicesType || !indicesType.hasRank() || indicesType.getRank() < 1 || ++ !indicesType.getElementType().isSignlessInteger(32)) ++ return op_.emitError() << "expects indices (result #1) " ++ << "to be a tensor of si32 of rank at least 1"; ++ ++ // mhlo.topk_c1 && mhlo.topk_c2 ++ auto operandLastDim = operandType.getRank() - 1; ++ SmallVector expectedValuesShape(operandType.getShape()); ++ 
expectedValuesShape[operandLastDim] = k; ++ if (failed(verifyCompatibleShape(expectedValuesShape, valuesType.getShape()))) ++ return op_.emitError() << "expects the values shape to match the operand " ++ "shape in all but the last dimension, and " ++ "that the last dimension of the values shape " ++ "has a size k"; ++ ++ // mhlo.topk_c3 ++ if (valuesType.getElementType() != operandType.getElementType()) ++ return op_.emitError() ++ << "expects the values element type to be the same as the operand " ++ << "element type"; ++ ++ // mhlo.topk_c4 ++ if (failed( ++ verifyCompatibleShape(indicesType.getShape(), valuesType.getShape()))) ++ return op_.emitError() ++ << "expects the indices shape to match the values shape"; ++ ++ return success(); ++} ++ ++TypedValue TopKOpAdaptor::getOperand() { ++ return op_.getInputs()[0].cast>(); ++} ++ ++TypedValue TopKOpAdaptor::getValues() { ++ return op_.getResults()[0].cast>(); ++} ++ ++TypedValue TopKOpAdaptor::getIndices() { ++ return op_.getResults()[1].cast>(); ++} ++ ++int64_t TopKOpAdaptor::getK() { ++ DictionaryAttr topkAttributes = ++ op_->getAttrOfType("mhlo.attributes"); ++ return topkAttributes.get("k").cast().getInt(); ++} ++ ++bool TopKOpAdaptor::getLargest() { ++ DictionaryAttr topkAttributes = ++ op_->getAttrOfType("mhlo.attributes"); ++ IntegerAttr largest = ++ topkAttributes.get("largest").dyn_cast_or_null(); ++ ++ return (!largest) ? true : largest.getInt(); ++} ++ ++std::optional getTopKOp(CustomCallOp op) { ++ if (op.getCallTargetName() != "mhlo.topk") return {}; ++ return TopKOpAdaptor(op); ++} ++ ++} // namespace experimental +} // namespace stablehlo +} // namespace mlir -diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo/dialect/ExperimentalOps.h ---- stablehlo/stablehlo/dialect/ExperimentalOps.h -+++ stablehlo/stablehlo/dialect/ExperimentalOps.h -@@ -0,0 +1,227 @@ +diff --ruN a/stablehlo/stablehlo/experimental/dialect/StablehloOps.h b/stablehlo/stablehlo/experimental/dialect/StablehloOps.h +--- stablehlo/stablehlo/experimental/dialect/StablehloOps.h ++++ stablehlo/stablehlo/experimental/dialect/StablehloOps.h +@@ -0,0 +1,299 @@ +/* Copyright 2023 The StableHLO Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); @@ -787,8 +1086,8 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo +limitations under the License. +==============================================================================*/ + -+#ifndef STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H -+#define STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H ++#ifndef STABLEHLO_EXPERIMENTAL_DIALECT_STABLEHLO_OPS_H ++#define STABLEHLO_EXPERIMENTAL_DIALECT_STABLEHLO_OPS_H + +// This file supports XLA-specific experiments with the StableHLO opset. +// These experiments are not yet ready to be upstreamed to openxla/stablehlo @@ -805,9 +1104,11 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LogicalResult.h" +#include "stablehlo/dialect/StablehloOps.h" ++#include "stablehlo/experimental/dialect/Base.h" + +namespace mlir { +namespace stablehlo { ++namespace experimental { + +// The DynamicReduceWindowOp experiment provides a dynamic version of +// ReduceWindowOp. Once the dynamism RFC is figured out, we expect to have an @@ -995,55 +1296,253 @@ diff --ruN a/stablehlo/stablehlo/dialect/ExperimentalOps.h b/stablehlo/stablehlo +// "stablehlo.dynamic_top_k". 
+std::optional getDynamicTopKOp(CustomCallOp op); + ++/////////////////// ++// MHLO Op Wrappers ++// There are some ops in MHLO which have experimental support in StableHLO ++// programs by representing them as custom_calls with the target `mhlo.op_name`. ++// The level of support of these ops is similar to the other custom_calls in ++// this file. Generally these ops will be added to StableHLO and their ++// experimental support can be deprecated in favor of op's type inference. ++/////////////////// ++ ++// The TopK experiment provides a StableHLO adapter to MHLO TopKOp. ++// In the future we expect stablehlo.top_k to be added which will use the same ++// refinement rules. ++// ++// Within this experiment, TopKOp is represented via the serialized MHLO ++// `stablehlo.custom_call @mhlo.topk` custom call. ++// ++// The semantics of experimental TopKOp are inherited from the semantics of ++// mhlo.topk. ++// ++// #### Inputs ++// ++// | Label | Name | Type | ++// |-------|-----------------|----------------------------------------------| ++// | (I1) | `operand` | tensor of integer or floating-point type | ++// | (I2) | `k` | constant of type si64 | ++// | (I3) | `largest` | constant of type i1 | ++// ++// #### Outputs ++// ++// | Name | Type | ++// |----------------|------------------------------------------| ++// | `values` | tensor of integer or floating-point type | ++// | `indices` | tensor of si32 type | ++// ++// #### Constraints ++// ++// * (C1) `shape(values)[:-1] = shape(operand)[:-1]` ++// * (C2) `shape(values)[-1] = k` ++// * (C3) `element_type(values) = element_type(operand)` ++// * (C4) `shape(indices) = shape(values)` ++// * (C5) `k >= 0` ++// ++class TopKOpAdaptor { ++ public: ++ TopKOpAdaptor(CustomCallOp op) : op_(op) {} ++ operator Operation*() { return op_; } ++ Operation* operator->() { return op_; } ++ ++ // These accessors assume that the operation is well-formed (i.e. that it ++ // can pass verification). ++ TypedValue getOperand(); ++ TypedValue getValues(); ++ TypedValue getIndices(); ++ int64_t getK(); ++ bool getLargest(); ++ ++ // Verifies the constraints documented above. ++ // Emits errors if errors are detected. ++ LogicalResult verify(); ++ ++ private: ++ CustomCallOp op_; ++}; ++ ++// Wraps a custom call in a TopKOpAdaptor. ++// Fails if the call_target_name of the custom call doesn't match ++// "mhlo.topk". 
++std::optional getTopKOp(CustomCallOp op); ++ ++} // namespace experimental +} // namespace stablehlo +} // namespace mlir + -+#endif // STABLEHLO_DIALECT_EXPERIMENTAL_OPS_H -diff --ruN a/stablehlo/stablehlo/dialect/StablehloOps.cpp b/stablehlo/stablehlo/dialect/StablehloOps.cpp ---- stablehlo/stablehlo/dialect/StablehloOps.cpp -+++ stablehlo/stablehlo/dialect/StablehloOps.cpp -@@ -1543,6 +1543,7 @@ - p << " across dimensions = ["; - llvm::interleaveComma(getDimensions().getValues(), p); - p << "]"; -+ p.printOptionalAttrDict(getOperation()->getAttrs(), {"dimensions"}); - p << " : "; - p.printFunctionalType(*this); - } else { -@@ -1705,6 +1706,7 @@ - if (parser.parseKeyword("across") || parser.parseKeyword("dimensions") || - parser.parseEqual() || - parser.parseCommaSeparatedList(AsmParser::Delimiter::Square, parseDim) || -+ parser.parseOptionalAttrDict(result.attributes) || - parser.parseColon() || parser.parseType(reduceOpFnType) || - parser.parseOptionalLocationSpecifier(explicitLoc)) - return failure(); -diff --ruN a/stablehlo/stablehlo/tests/print_reduce.mlir b/stablehlo/stablehlo/tests/print_reduce.mlir ---- stablehlo/stablehlo/tests/print_reduce.mlir -+++ stablehlo/stablehlo/tests/print_reduce.mlir -@@ -168,3 +168,15 @@ - - func.return %0: tensor<4xf32> - } ++#endif // STABLEHLO_EXPERIMENTAL_DIALECT_STABLEHLO_OPS_H +diff --ruN a/stablehlo/stablehlo/experimental/tests/BUILD.bazel b/stablehlo/stablehlo/experimental/tests/BUILD.bazel +--- stablehlo/stablehlo/experimental/tests/BUILD.bazel ++++ stablehlo/stablehlo/experimental/tests/BUILD.bazel +@@ -0,0 +1,59 @@ ++# Copyright 2023 The StableHLO Authors. All Rights Reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++load("@bazel_skylib//rules:expand_template.bzl", "expand_template") ++load("@llvm-project//llvm:lit_test.bzl", "lit_test", "package_path") ++ ++package( ++ default_visibility = ["//visibility:public"], ++ licenses = ["notice"], ++) + -+// The test case makes sure any custom attrs set on the reduce-op are -+// printed/parsed when pretty-printed. ++# Equivalent of configure_lit_site_cfg from CMakeLists.txt. ++expand_template( ++ name = "lit_site_cfg_py_gen", ++ testonly = True, ++ out = "lit.site.cfg.py", ++ substitutions = { ++ "@LIT_SITE_CFG_IN_HEADER@": "# Autogenerated, do not edit.", ++ "@LLVM_TOOLS_DIR@": package_path("@llvm-project//llvm:BUILD"), ++ "\"@STABLEHLO_TOOLS_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], 'stablehlo')", ++ "\"@STABLEHLO_SOURCE_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], 'stablehlo')", ++ }, ++ template = "lit.site.cfg.py.in", ++) + -+// CHECK-LABEL: func @pretty_print_with_custom_attr -+// CHECK: applies stablehlo.add across dimensions = [1] {custom_user_attr = 1 : i64} ++# Equivalent of add_lit_testsuite from CMakeLists.txt. 
++[ ++ lit_test( ++ name = "%s.test" % src, ++ size = "small", ++ srcs = [src], ++ data = [ ++ "lit.cfg.py", ++ "lit.site.cfg.py", ++ "//:stablehlo-opt", ++ "//:stablehlo-translate", ++ "//stablehlo/experimental:experimental-stablehlo-opt", ++ "@llvm-project//llvm:FileCheck", ++ "@llvm-project//llvm:not", ++ ] + glob(["%s.bc" % src]), ++ tags = ["stablehlo_tests"], ++ ) ++ for src in glob(["**/*.mlir"]) ++] ++ ++test_suite( ++ name = "experimental_stablehlo_tests", ++ tags = ["experimental_stablehlo_tests"], ++) +diff --ruN a/stablehlo/stablehlo/experimental/tests/CMakeLists.txt b/stablehlo/stablehlo/experimental/tests/CMakeLists.txt +--- stablehlo/stablehlo/experimental/tests/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/tests/CMakeLists.txt +@@ -0,0 +1,29 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++configure_lit_site_cfg( ++ ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ++ ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py ++ MAIN_CONFIG ++ ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py ++) ++add_lit_testsuite(check-experimental-stablehlo-tests "Running the experimental/tests/ suite" ++ ${CMAKE_CURRENT_BINARY_DIR} ++ DEPENDS ++ FileCheck ++ experimental-stablehlo-opt ++ stablehlo-translate ++) ++add_dependencies(check-stablehlo-quick check-experimental-stablehlo-tests) +diff --ruN a/stablehlo/stablehlo/experimental/tests/lit.cfg.py b/stablehlo/stablehlo/experimental/tests/lit.cfg.py +--- stablehlo/stablehlo/experimental/tests/lit.cfg.py ++++ stablehlo/stablehlo/experimental/tests/lit.cfg.py +@@ -0,0 +1,42 @@ ++"""Lit configuration to drive test in this repo.""" ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++# -*- Python -*- ++# pylint: disable=undefined-variable ++ ++import os ++ ++import lit.formats ++from lit.llvm import llvm_config ++ ++# Populate Lit configuration with the minimal required metadata. ++# Some metadata is populated in lit.site.cfg.py.in. 
++config.name = 'STABLEHLO_TESTS_SUITE' ++config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) ++config.suffixes = ['.mlir'] ++config.test_source_root = os.path.dirname(__file__) ++ ++# Make LLVM and StableHLO tools available in RUN directives ++tools = [ ++ 'FileCheck', ++ 'experimental-stablehlo-opt', ++ 'stablehlo-translate', ++ 'not', ++] ++tool_dirs = [ ++ config.llvm_tools_dir, ++ config.stablehlo_tools_dir, ++] ++llvm_config.add_tool_substitutions(tools, tool_dirs) +diff --ruN a/stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in b/stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in +--- stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in ++++ stablehlo/stablehlo/experimental/tests/lit.site.cfg.py.in +@@ -0,0 +1,21 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++@LIT_SITE_CFG_IN_HEADER@ ++ ++import lit.llvm ++lit.llvm.initialize(lit_config, config) ++config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" ++config.stablehlo_tools_dir = "@STABLEHLO_TOOLS_DIR@" ++lit_config.load_config(config, "@STABLEHLO_SOURCE_DIR@" + "/stablehlo/experimental/tests/lit.cfg.py") +diff --ruN a/stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir b/stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir +--- stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir ++++ stablehlo/stablehlo/experimental/tests/stablehlo_canonicalize_dynamism.mlir +@@ -0,0 +1,344 @@ ++// RUN: experimental-stablehlo-opt --experimental-stablehlo-canonicalize-dynamism --split-input-file --verify-diagnostics %s | FileCheck %s + -+func.func @pretty_print_with_custom_attr(%arg0: tensor<2x64x13xf32>) -> tensor<2x13xf32> { -+ %0 = stablehlo.constant dense<0.000000e+00> : tensor -+ %1 = stablehlo.reduce(%arg0 init: %0) applies stablehlo.add across dimensions = [1] {custom_user_attr = 1 : i64} : (tensor<2x64x13xf32>, tensor) -> tensor<2x13xf32> -+ return %1 : tensor<2x13xf32> -+} -diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir ---- stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir -+++ stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir -@@ -426,6 +426,172 @@ - - // ----- - +// CHECK-LABEL: func @dynamic_reduce_window_success_static_result_type +func.func @dynamic_reduce_window_success_static_result_type(%arg0: tensor<3x2xf32>, %arg1: tensor) -> tensor<2x2xf32> { + // CHECK-NOT: stablehlo.dynamic_reduce_window @@ -1209,17 +1708,6 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st +} + +// ----- -+ - // CHECK-LABEL: func @dynamic_reshape_success - func.func @dynamic_reshape_success(%arg0: tensor<4xf32>) -> tensor<1x4xf32> { - // CHECK-NOT: stablehlo.dynamic_reshape -@@ -452,6 +618,185 @@ - %0 = stablehlo.constant dense<[1, 4]> : tensor<2xi64> - %1 = stablehlo.dynamic_reshape %arg0, %0 : 
(tensor<4xf32>, tensor<2xi64>) -> tensor<1x?xf32> - return %1 : tensor<1x?xf32> -+} -+ -+// ----- + +// CHECK-LABEL: func @dynamic_rng_bit_generator_success +func.func @dynamic_rng_bit_generator_success(%arg0: tensor<2xui64>) -> tensor<1x4xf32> { @@ -1396,16 +1884,13 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_canonicalize_dynamism.mlir b/st + %k = stablehlo.constant dense<3> : tensor + %1:2 = stablehlo.custom_call @stablehlo.dynamic_top_k(%arg0, %k) : (tensor<16xf32>, tensor) -> (tensor<3xf32>, tensor<4xi32>) + return %1#0, %1#1 : tensor<3xf32>, tensor<4xi32> - } - - // ----- -diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir ---- stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir -+++ stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir -@@ -607,12 +607,55 @@ - - // ----- - ++} +diff --ruN a/stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir b/stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir +--- stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir ++++ stablehlo/stablehlo/experimental/tests/stablehlo_refine_shapes.mlir +@@ -0,0 +1,152 @@ ++// RUN: experimental-stablehlo-opt --experimental-stablehlo-refine-shapes --split-input-file --verify-diagnostics %s | FileCheck %s ++ +// CHECK-LABEL: @main +func.func @main(%arg0: tensor<3x2xf32>, %arg1: tensor) -> tensor<*xf32> { + // CHECK: stablehlo.dynamic_reduce_window{{.*}} -> tensor<2x2xf32> @@ -1426,16 +1911,6 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/ +} + +// ----- -+ - // CHECK-LABEL: @refine_dynamic_reshape - func.func @refine_dynamic_reshape(%arg0: tensor<4xf32>) -> tensor<*xf32> { - // CHECK: stablehlo.dynamic_reshape{{.*}} -> tensor<1x4xf32> - %0 = stablehlo.constant dense<[1, 4]> : tensor<2xi64> - %1 = stablehlo.dynamic_reshape %arg0, %0 : (tensor<4xf32>, tensor<2xi64>) -> tensor<*xf32> - func.return %1 : tensor<*xf32> -+} -+ -+// ----- + +// CHECK-LABEL: @refine_dynamic_rng_bit_generator +func.func @refine_dynamic_rng_bit_generator(%arg0: tensor<2xui64>) -> (tensor, tensor<*xf32>) { @@ -1455,36 +1930,374 @@ diff --ruN a/stablehlo/stablehlo/tests/stablehlo_refine_shapes.mlir b/stablehlo/ + %k = stablehlo.constant dense<4> : tensor + %1:2 = stablehlo.custom_call @stablehlo.dynamic_top_k(%arg0, %k) : (tensor<16xf32>, tensor) -> (tensor, tensor) + return %1#0, %1#1 : tensor, tensor - } - - // ----- -diff --ruN a/stablehlo/stablehlo/transforms/Passes.td b/stablehlo/stablehlo/transforms/Passes.td ---- stablehlo/stablehlo/transforms/Passes.td -+++ stablehlo/stablehlo/transforms/Passes.td -@@ -25,6 +25,7 @@ - For example, if the output_shape operand of DynamicReshapeOp is a constant - value, then the operation can be transformed to ReshapeOp. 
- }]; ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_topk ++func.func @refine_mhlo_topk(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // CHECK: mhlo.topk{{.*}} -> (tensor<5x4xf32>, tensor<5x4xi32>) ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_too_many_operands ++func.func @refine_mhlo_error_too_many_operands(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects size(operands) = 1}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0, %arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>, tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_too_few_results ++func.func @refine_mhlo_error_too_few_results(%arg0: tensor<5x16xf32>) -> (tensor) { ++ // expected-error@+1{{expects size(results) = 2}} ++ %0 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor) ++ return %0 : tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_wrong_output_1_type ++func.func @refine_mhlo_error_wrong_output_1_type(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects values (result #0) to be a tensor of integer or floating-point type of rank at least 1}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_wrong_output_2_type ++func.func @refine_mhlo_error_wrong_output_2_type(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects indices (result #1) to be a tensor of si32 of rank at least 1}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c1_wrong_output_shape ++func.func @refine_mhlo_error_c1_wrong_output_shape(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the values shape to match the operand}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c2_last_dim_not_k ++func.func @refine_mhlo_error_c2_last_dim_not_k(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the values shape to match the operand}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c3_wrong_output_type ++func.func @refine_mhlo_error_c3_wrong_output_type(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the values element type to be the same as the operand element type}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// 
----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c4_outputs_shape_mismatch ++func.func @refine_mhlo_error_c4_outputs_shape_mismatch(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects the indices shape to match the values shape}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = 4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} ++ ++// ----- ++ ++// CHECK-LABEL: func @refine_mhlo_error_c5_negative_k ++func.func @refine_mhlo_error_c5_negative_k(%arg0: tensor<5x16xf32>) -> (tensor, tensor) { ++ // expected-error@+1{{expects k >= 0}} ++ %0:2 = stablehlo.custom_call @mhlo.topk(%arg0) { ++ mhlo.attributes = { k = -4 : i64, largest = true} ++ } : (tensor<5x16xf32>) -> (tensor, tensor) ++ return %0#0, %0#1 : tensor, tensor ++} +diff --ruN a/stablehlo/stablehlo/experimental/tools/CMakeLists.txt b/stablehlo/stablehlo/experimental/tools/CMakeLists.txt +--- stablehlo/stablehlo/experimental/tools/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/tools/CMakeLists.txt +@@ -0,0 +1,41 @@ ++# Copyright 2020 The TensorFlow Authors. All Rights Reserved. ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++set(LLVM_OPTIONAL_SOURCES ++ StablehloOptMain.cpp ++) ++ ++# stablehlo-opt ++get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) ++get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) ++get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) ++set(LIBS ++ ${dialect_libs} ++ ${conversion_libs} ++ ${extension_libs} ++ ExperimentalStablehloPasses ++ MLIROptLib ++ StablehloRegister ++ StablehloTestUtils ++ StablehloPasses ++ InterpreterOps ++ StablehloTOSATransforms ++ ) ++add_llvm_executable(experimental-stablehlo-opt StablehloOptMain.cpp) ++llvm_update_compile_flags(experimental-stablehlo-opt) ++target_link_libraries(experimental-stablehlo-opt PRIVATE ${LIBS}) ++ ++mlir_check_all_link_libraries(experimental-stablehlo-opt) ++ +diff --ruN a/stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp b/stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp +--- stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp ++++ stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp +@@ -0,0 +1,46 @@ ++/* Copyright 2023 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. 
++==============================================================================*/ ++ ++#include "mlir/Dialect/Tosa/IR/TosaOps.h" ++#include "mlir/Dialect/Tosa/Transforms/Passes.h" ++#include "mlir/InitAllDialects.h" ++#include "mlir/InitAllExtensions.h" ++#include "mlir/InitAllPasses.h" ++#include "mlir/Tools/mlir-opt/MlirOptMain.h" ++#include "stablehlo/conversions/tosa/transforms/Passes.h" ++#include "stablehlo/dialect/Register.h" ++#include "stablehlo/experimental/transforms/Passes.h" ++#include "stablehlo/reference/InterpreterOps.h" ++#include "stablehlo/tests/TestUtils.h" ++#include "stablehlo/transforms/Passes.h" ++ ++int main(int argc, char **argv) { ++ mlir::registerAllPasses(); ++ mlir::hlo::registerAllTestPasses(); ++ mlir::stablehlo::registerPassPipelines(); ++ mlir::stablehlo::registerPasses(); ++ mlir::stablehlo::experimental::registerPasses(); ++ mlir::tosa::registerStablehloLegalizeToTosaPassPass(); ++ mlir::tosa::registerStablehloPrepareForTosaPassPass(); ++ ++ mlir::DialectRegistry registry; ++ mlir::registerAllDialects(registry); ++ mlir::registerAllExtensions(registry); ++ mlir::stablehlo::registerAllDialects(registry); ++ registry.insert(); ++ ++ return failed( ++ mlir::MlirOptMain(argc, argv, "Experimental StableHLO optimizer driver\n", registry)); ++} +diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt +--- stablehlo/stablehlo/experimental/transforms/CMakeLists.txt ++++ stablehlo/stablehlo/experimental/transforms/CMakeLists.txt +@@ -0,0 +1,39 @@ ++# Copyright 2023 The StableHLO Authors. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# https://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++set(LLVM_TARGET_DEFINITIONS Passes.td) ++mlir_tablegen(Passes.h.inc -gen-pass-decls) ++add_public_tablegen_target(ExperimentalPassesIncGen) ++ ++add_mlir_dialect_library(ExperimentalStablehloPasses ++ PARTIAL_SOURCES_INTENDED ++ StablehloCanonicalizeDynamism.cpp ++ StablehloRefineShapes.cpp ++ ++ DEPENDS ++ ExperimentalPassesIncGen ++ ++ LINK_LIBS PUBLIC ++ ChloOps ++ MLIRFuncDialect ++ MLIRIR ++ MLIRInferTypeOpInterface ++ MLIRSupport ++ MLIRTransformUtils ++ ExperimentalStablehloOps ++ StablehloBase ++ StablehloOps ++ StablehloPasses ++ StablehloTypeInference ++) +diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/stablehlo/experimental/transforms/Passes.h +--- stablehlo/stablehlo/experimental/transforms/Passes.h ++++ stablehlo/stablehlo/experimental/transforms/Passes.h +@@ -0,0 +1,37 @@ ++/* Copyright 2023 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef STABLEHLO_EXPERIMENTAL_TRANSFORMS_PASSES_H ++#define STABLEHLO_EXPERIMENTAL_TRANSFORMS_PASSES_H ++ ++#include ++ ++#include "mlir/Pass/Pass.h" ++#include "mlir/Transforms/DialectConversion.h" ++ ++namespace mlir { ++namespace stablehlo { ++namespace experimental { ++ ++#define GEN_PASS_DECL_STABLEHLOCANONICALIZEDYNAMISMPASS ++#define GEN_PASS_DECL_STABLEHLOREFINESHAPESPASS ++#define GEN_PASS_REGISTRATION ++#include "stablehlo/experimental/transforms/Passes.h.inc" ++ ++} // namespace experimental ++} // namespace stablehlo ++} // namespace mlir ++ ++#endif // STABLEHLO_EXPERIMENTAL_TRANSFORMS_PASSES_H +diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.td b/stablehlo/stablehlo/experimental/transforms/Passes.td +--- stablehlo/stablehlo/experimental/transforms/Passes.td ++++ stablehlo/stablehlo/experimental/transforms/Passes.td +@@ -0,0 +1,31 @@ ++/* Copyright 2023 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++include "mlir/Pass/PassBase.td" ++ ++def StablehloCanonicalizeDynamismPass : Pass<"experimental-stablehlo-canonicalize-dynamism", "func::FuncOp"> { ++ let summary = "(Experimental) Canonicalizes dynamic StableHLO ops into static ops."; ++ let description = [{ ++ Experimental version of the --stablehlo-canonicalize-dynamism pass. ++ }]; + let dependentDialects = ["mlir::chlo::ChloDialect"]; - } - - def StablehloLegalizeToVhloPass : Pass<"stablehlo-legalize-to-vhlo", "ModuleOp"> { -diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp ---- stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp -+++ stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp -@@ -24,6 +24,8 @@ - #include "mlir/Interfaces/InferTypeOpInterface.h" - #include "mlir/Support/LogicalResult.h" - #include "mlir/Transforms/GreedyPatternRewriteDriver.h" ++} ++ ++def StablehloRefineShapesPass : Pass<"experimental-stablehlo-refine-shapes", "ModuleOp"> { ++ let summary = "(Experimental) Refines shapes across a StableHLO program."; ++ let description = [{ ++ Experimental version of the --stablehlo-refine-shapes pass. ++ }]; ++} +diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp +--- stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp ++++ stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp +@@ -0,0 +1,167 @@ ++/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. ++ Copyright 2023 The StableHLO Authors. 
++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include ++ ++#include "llvm/ADT/STLExtras.h" ++#include "llvm/ADT/SmallVector.h" ++#include "mlir/Dialect/Func/IR/FuncOps.h" ++#include "mlir/IR/PatternMatch.h" ++#include "mlir/Interfaces/InferTypeOpInterface.h" ++#include "mlir/Support/LogicalResult.h" ++#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "stablehlo/dialect/ChloOps.h" -+#include "stablehlo/dialect/ExperimentalOps.h" - #include "stablehlo/dialect/StablehloOps.h" - #include "stablehlo/transforms/Passes.h" - -@@ -198,6 +200,54 @@ - } - }; - ++#include "stablehlo/dialect/StablehloOps.h" ++#include "stablehlo/experimental/dialect/StablehloOps.h" ++#include "stablehlo/experimental/transforms/Passes.h" ++#include "stablehlo/transforms/Passes.h" ++ ++namespace mlir { ++namespace stablehlo { ++namespace experimental { ++ ++#define GEN_PASS_DEF_STABLEHLOCANONICALIZEDYNAMISMPASS ++#include "stablehlo/experimental/transforms/Passes.h.inc" ++ ++namespace { ++ +struct CanonicalizeDynamicReduceWindowOpPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; @@ -1532,17 +2345,6 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/ + return success(); + } +}; -+ - struct CanonicalizeDynamicReshapeOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; -@@ -210,6 +260,56 @@ - if (!op.getType().hasStaticShape()) - return rewriter.notifyMatchFailure(op, "expected static result type"); - rewriter.replaceOpWithNewOp(op, op.getType(), op.getOperand()); -+ return success(); -+ } -+}; + +struct CanonicalizeDynamicRngBitGeneratorOpPattern + : public OpRewritePattern { @@ -1590,35 +2392,84 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/ + + rewriter.replaceOpWithNewOp( + op, op->getResultTypes(), op.getOperand(), k[0]); - return success(); - } - }; -@@ -320,7 +420,10 @@ - patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); ++ return success(); ++ } ++}; ++ ++struct StablehloCanonicalizeDynamismPass ++ : public impl::StablehloCanonicalizeDynamismPassBase< ++ StablehloCanonicalizeDynamismPass> { ++ using StablehloCanonicalizeDynamismPassBase:: ++ StablehloCanonicalizeDynamismPassBase; ++ ++ void runOnOperation() override { ++ GreedyRewriteConfig config; ++ config.useTopDownTraversal = true; ++ config.enableRegionSimplification = true; ++ config.maxIterations = 2; ++ config.maxNumRewrites = GreedyRewriteConfig::kNoLimit; ++ config.strictMode = GreedyRewriteStrictness::AnyOp; ++ ++ RewritePatternSet patterns(&getContext()); ++ populateStablehloCanonicalizeDynamismPatterns(&patterns, &getContext()); + patterns.add(&getContext()); - patterns.add(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); - patterns.add( - &getContext()); - patterns.add(&getContext()); -diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp 
b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp ---- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp -+++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp -@@ -43,6 +43,7 @@ - #include "mlir/Transforms/GreedyPatternRewriteDriver.h" - #include "stablehlo/dialect/Base.h" - #include "stablehlo/dialect/ChloOps.h" -+#include "stablehlo/dialect/ExperimentalOps.h" - #include "stablehlo/dialect/StablehloOps.h" - #include "stablehlo/dialect/TypeInference.h" - #include "stablehlo/transforms/Passes.h" -@@ -844,12 +845,97 @@ - } - }; - ++ if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), ++ config))) { ++ return signalPassFailure(); ++ } ++ } ++}; ++ ++} // namespace ++} // namespace experimental ++} // namespace stablehlo ++} // namespace mlir +diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp +--- stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp ++++ stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.cpp +@@ -0,0 +1,178 @@ ++/* Copyright 2022 The StableHLO Authors. ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include "stablehlo/transforms/StablehloRefineShapes.h" ++ ++#include ++ ++#include "llvm/ADT/SmallVector.h" ++#include "mlir/Dialect/Func/IR/FuncOps.h" ++#include "mlir/IR/PatternMatch.h" ++#include "mlir/Interfaces/InferTypeOpInterface.h" ++#include "mlir/Support/LogicalResult.h" ++#include "mlir/Transforms/GreedyPatternRewriteDriver.h" ++#include "stablehlo/dialect/Base.h" ++#include "stablehlo/dialect/StablehloOps.h" ++#include "stablehlo/dialect/TypeInference.h" ++#include "stablehlo/experimental/dialect/StablehloOps.h" ++#include "stablehlo/experimental/transforms/Passes.h" ++#include "stablehlo/transforms/Passes.h" ++ ++namespace mlir { ++namespace stablehlo { ++namespace experimental { ++ ++#define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS ++#include "stablehlo/experimental/transforms/Passes.h.inc" ++ ++namespace { ++ +struct RefineDynamicReduceWindowOpPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; @@ -1660,15 +2511,6 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl + return refineReturnTypes(rewriter, op, inferredReturnTypes); + } +}; -+ - struct RefineDynamicReshapeOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(DynamicReshapeOp op, - PatternRewriter& rewriter) const override { - return refineReturnShape(rewriter, op, op.getOutputShape()); -+ } -+}; + +struct RefineDynamicRngBitGeneratorOpPattern + : public OpRewritePattern { @@ -1710,18 +2552,908 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehl + + outputShape[operandType.getRank() - 1] = k[0]; + return refineReturnTypes(rewriter, op, {{outputShape}, {outputShape}}); - } - }; - -@@ -1181,7 +1267,10 
@@ - patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); ++ } ++}; ++ ++struct RefineTopKOpPattern : public OpRewritePattern { ++ using OpRewritePattern::OpRewritePattern; ++ LogicalResult matchAndRewrite(CustomCallOp impl, ++ PatternRewriter& rewriter) const override { ++ auto maybeOp = getTopKOp(impl); ++ if (!maybeOp || failed(maybeOp->verify())) return failure(); ++ TopKOpAdaptor op = *maybeOp; ++ ++ auto operandType = op.getOperand().getType().cast(); ++ SmallVector outputShape(operandType.getShape()); ++ outputShape.back() = op.getK(); ++ return refineReturnTypes(rewriter, op, {{outputShape}, {outputShape}}); ++ } ++}; ++ ++struct StablehloRefineShapesPass ++ : public impl::StablehloRefineShapesPassBase { ++ using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase; ++ ++ void runOnOperation() override { ++ auto func = getStablehloRefineShapesTarget(getOperation()); ++ if (!func) return signalPassFailure(); ++ ++ // The algorithm behind this pass consists of a single traversal of the ++ // function. This is sufficient because we only support one function per ++ // program at the moment. ++ // TODO(#1048): Find out why .maxIterations = 1 no longer works. ++ // There have been recent refactors to applyPatternsAndFoldGreedily ++ // upstream, and that might be the reason. ++ GreedyRewriteConfig config; ++ config.useTopDownTraversal = true; ++ config.enableRegionSimplification = true; ++ config.maxIterations = 2; ++ config.maxNumRewrites = GreedyRewriteConfig::kNoLimit; ++ config.strictMode = GreedyRewriteStrictness::AnyOp; ++ ++ RewritePatternSet patterns(&getContext()); ++ populateStablehloRefineShapesPatterns(&patterns, &getContext()); + patterns.add(&getContext()); - patterns.add(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); - patterns.add(&getContext()); ++ patterns.add(&getContext()); ++ if (failed( ++ applyPatternsAndFoldGreedily(func, std::move(patterns), config))) { ++ return signalPassFailure(); ++ } ++ } ++}; ++ ++} // namespace ++} // namespace experimental ++} // namespace stablehlo ++} // namespace mlir +diff --ruN a/stablehlo/stablehlo/tests/infer_chlo.mlir b/stablehlo/stablehlo/tests/infer_chlo.mlir +--- stablehlo/stablehlo/tests/infer_chlo.mlir ++++ stablehlo/stablehlo/tests/infer_chlo.mlir +@@ -120,10 +120,10 @@ + // ----- + // CHECK-LABEL: @broadcast_select_reify + func.func @broadcast_select_reify(%arg0: tensor<2xi1>, %arg1: tensor, %arg2: tensor) -> tensor<1xindex> { +- // CHECK: %0 = shape.const_shape [2] : tensor<1xindex> ++ // CHECK: %0 = shape.shape_of %arg0 : tensor<2xi1> -> tensor<1xindex> + // CHECK-NEXT: %1 = shape.shape_of %arg1 : tensor -> tensor<1xindex> + // CHECK-NEXT: %2 = shape.shape_of %arg2 : tensor -> tensor<1xindex> +- // CHECK-NEXT: %3 = shape.broadcast %1, %2, %0 : tensor<1xindex>, tensor<1xindex>, tensor<1xindex> -> tensor<1xindex> ++ // CHECK-NEXT: %3 = shape.broadcast %0, %1, %2 : tensor<1xindex>, tensor<1xindex>, tensor<1xindex> -> tensor<1xindex> + %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor, tensor) -> tensor + %1 = "hlo_test_infer.reify_return_type_shapes"(%0) : (tensor) -> tensor<1xindex> + return %1: tensor<1xindex> +diff --ruN a/stablehlo/stablehlo/transforms/Passes.h b/stablehlo/stablehlo/transforms/Passes.h +--- stablehlo/stablehlo/transforms/Passes.h ++++ stablehlo/stablehlo/transforms/Passes.h +@@ -18,9 +18,12 @@ + + #include + ++#include 
"mlir/Dialect/Func/IR/FuncOps.h" + #include "mlir/Dialect/Quant/QuantOps.h" + #include "mlir/Dialect/Shape/IR/Shape.h" ++#include "mlir/IR/BuiltinOps.h" + #include "mlir/Pass/Pass.h" ++#include "mlir/Support/LogicalResult.h" + #include "mlir/Transforms/DialectConversion.h" + + namespace mlir { +@@ -34,6 +37,14 @@ + #define GEN_PASS_DECL_VHLOTOVERSIONPASS + #define GEN_PASS_REGISTRATION + #include "stablehlo/transforms/Passes.h.inc" ++ ++// Populates --stablehlo-canonicalize-dynamism patterns. ++void populateStablehloCanonicalizeDynamismPatterns(RewritePatternSet *patterns, ++ MLIRContext *context); ++ ++// Populates --stablehlo-refine-shapes patterns. ++void populateStablehloRefineShapesPatterns(RewritePatternSet *patterns, ++ MLIRContext *context); + + // Populates StableHLO ops to VHLO ops rewriting patterns. + void populateStablehloToVhloPatterns(RewritePatternSet *patterns, +diff --ruN a/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp +--- stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp ++++ stablehlo/stablehlo/transforms/StablehloCanonicalizeDynamism.cpp +@@ -307,16 +307,7 @@ + config.strictMode = GreedyRewriteStrictness::AnyOp; + + RewritePatternSet patterns(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add( +- &getContext()); +- patterns.add(&getContext()); ++ populateStablehloCanonicalizeDynamismPatterns(&patterns, &getContext()); + if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns), + config))) { + return signalPassFailure(); +@@ -325,5 +316,19 @@ + }; + + } // namespace ++ ++void populateStablehloCanonicalizeDynamismPatterns(RewritePatternSet* patterns, ++ MLIRContext* context) { ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++} ++ + } // namespace stablehlo + } // namespace mlir +diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp b/stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp +--- stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp ++++ stablehlo/stablehlo/transforms/StablehloRefineShapes.cpp +@@ -11,6 +11,8 @@ + See the License for the specific language governing permissions and + limitations under the License. + ==============================================================================*/ ++ ++#include "stablehlo/transforms/StablehloRefineShapes.h" + + #include + #include +@@ -53,6 +55,193 @@ + #define GEN_PASS_DEF_STABLEHLOREFINESHAPESPASS + #include "stablehlo/transforms/Passes.h.inc" + ++LogicalResult refineValues(PatternRewriter& rewriter, Operation* op, ++ ValueRange values, TypeRange types) { ++ if (values.size() != types.size()) ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "refineValues failed for " << types << ": expected " ++ << values.size() << " types, got " << types.size(); ++ }); ++ ++ // Check whether `types` contain any new information with respect to existing ++ // return types. Even if just a single dimension size out of an entire tensor ++ // type got updated, using `inferMostSpecificType` ensures that we don't ++ // miss that. 
++ bool needsRefinement = false; ++ SmallVector refinedTypes; ++ for (auto it : llvm::zip(values.getTypes(), types)) { ++ // Cannot use structured bindings to simplify this because capturing ++ // structured bindings in a lambda is a C++ 20 extension. ++ auto currentType = std::get<0>(it); ++ auto refinement = std::get<1>(it); ++ auto refinedType = hlo::inferMostSpecificType( ++ /*location=*/{}, {currentType, refinement}); ++ if (failed(refinedType)) ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "inferMostSpecificType failed for " << currentType << " and " ++ << refinement; ++ }); ++ refinedTypes.push_back(*refinedType); ++ needsRefinement |= (currentType != *refinedType); ++ } ++ if (!needsRefinement) ++ return rewriter.notifyMatchFailure(op, "doesn't need refinement"); ++ ++ for (auto it : llvm::zip(values, refinedTypes)) { ++ // Cannot use structured bindings to simplify this because capturing ++ // structured bindings in a lambda is a C++ 20 extension. ++ auto value = std::get<0>(it); ++ auto refinedType = std::get<1>(it); ++ if (value.getType() == refinedType) continue; ++ ++ // Check whether the users of this value are ready for the type of the ++ // value to be refined. ++ for (Operation* user : value.getUsers()) { ++ // CHLO and StableHLO ops are designed to support type refinements of ++ // their operands and results. Any operand type in these ops can change ++ // within what's supported by `inferMostSpecificType` without breaking ++ // verification of the op. ++ if (isa(user->getDialect())) ++ continue; ++ ++ // Simply changing operand type of `func.return` won't work because ++ // that won't update the FunctionType of the enclosing `func.func`. ++ // Nonetheless, we still want to support these ops because they are widely ++ // used in StableHLO programs (although the plan of record is to replace ++ // `func.return` ops in StableHLO programs with `stablehlo.return`: ++ // https://github.com/openxla/stablehlo/issues/425). ++ if (isa(user)) continue; ++ ++ // Unlike in TensorFlow's type inference pass, here we work only with ++ // allowlisted ops to focus our support on well-defined semantics of ++ // StableHLO programs. ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "unsupported refinement: tried to refine " << value.getType() ++ << " to " << refinedType << " for user " << user; ++ }); ++ } ++ ++ // Happy path: simply call setType here because most of our users are ++ // fine with that. ++ auto unrefinedType = value.getType(); ++ value.setType(refinedType); ++ ++ // Special case: for `func.return`, guard the refinement with a cast ++ // and leave propagation of the refined return type to a dedicated pattern. ++ auto isFuncReturn = [](OpOperand& use) -> bool { ++ return isa(use.getOwner()); ++ }; ++ if (llvm::none_of(value.getUses(), isFuncReturn)) continue; ++ rewriter.setInsertionPointAfter(op); ++ auto castToUnrefinedType = rewriter.create( ++ op->getLoc(), unrefinedType, value); ++ value.replaceUsesWithIf(castToUnrefinedType.getOutputs()[0], isFuncReturn); ++ } ++ ++ return success(); ++} ++ ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef types) { ++ if (failed(refineValues(rewriter, op, op->getResults(), types))) ++ return failure(); ++ ++ // This `replaceOpWithIf` call doesn't actually change the IR, but ++ // it does ask the rewriter to visit all the users of this op. 
There is no ++ // upstream API to achieve this directly, but if it's introduced in the ++ // future, we could use it here. ++ rewriter.replaceOpWithIf(op, op->getResults(), ++ [](OpOperand& use) { return false; }); ++ return success(); ++} ++ ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef refinements) { ++ SmallVector flattenedTypes; ++ hlo::flattenTupleTypes(op->getResultTypes(), flattenedTypes); ++ auto flattenedSize = flattenedTypes.size(); ++ if (flattenedSize != refinements.size()) ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "refineReturnTypes failed: expected " << flattenedSize ++ << " refinements, got " << refinements.size(); ++ }); ++ ++ SmallVector flattenedRefinedTypes; ++ for (auto it : llvm::zip(flattenedTypes, refinements)) { ++ // Cannot use structured bindings to simplify this because capturing ++ // structured bindings in a lambda is a C++ 20 extension. ++ ShapedType currentType = std::get<0>(it).dyn_cast(); ++ ShapedTypeComponents refinement = std::get<1>(it); ++ auto failWithReason = [&](StringRef reason) { ++ return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { ++ diag << "refineTypes failed: refining " << currentType ++ << "with refinement: {"; ++ if (refinement.hasRank()) { ++ diag << "shape = [" << refinement.getDims() << "]"; ++ if (refinement.getAttribute()) ++ diag << "attribute = " << refinement.getAttribute(); ++ } else { ++ diag << "hasRank = false"; ++ } ++ diag << ", elementType = " << refinement.getElementType(); ++ diag << "} failed: " << reason; ++ }); ++ }; ++ ++ // If the current type is not a shaped type, then the refinement must ++ // be completely empty. ++ if (!currentType) { ++ if (refinement.hasRank() || refinement.getElementType() || ++ refinement.getAttribute()) ++ return failWithReason("unsupported refinement"); ++ flattenedRefinedTypes.push_back(currentType); ++ continue; ++ } ++ ++ // If the refinement has an element type, then it must be the same as ++ // the current element type. ++ Type currentElementType = currentType.getElementType(); ++ if (refinement.getElementType() && ++ currentElementType != refinement.getElementType()) ++ return failWithReason("expected compatible element types"); ++ ++ // If neither the current type nor the refinement are ranked, then there's ++ // nothing to refine, and we return the current type. ++ bool hasRank = currentType.hasRank() || refinement.hasRank(); ++ if (!hasRank) { ++ flattenedRefinedTypes.push_back(currentType); ++ continue; ++ } ++ ++ // If either the current type or the refinement have encodings, then ++ // we fail. Encodings are left for future work. ++ Attribute currentEncoding = nullptr; ++ if (auto currentRankedType = currentType.dyn_cast()) { ++ currentEncoding = currentRankedType.getEncoding(); ++ } ++ Attribute refinedEncoding = refinement.getAttribute(); ++ if (currentEncoding || refinedEncoding) ++ return failWithReason("expected compatible encodings"); ++ ++ // If both the current type and the refinement have shapes, use the shape ++ // from the refinement. Otherwise, pick whatever is available. ++ // Make sure that the resulting type is compatible with the current type ++ // to avoid creating invalid code. ++ auto refinedShape = ++ refinement.hasRank() ? 
refinement.getDims() : currentType.getShape(); ++ auto refinedType = RankedTensorType::get(refinedShape, currentElementType); ++ if (!hlo::isCompatibleForHloTypeInference(currentType, refinedType)) ++ return failWithReason("expected compatible shapes"); ++ flattenedRefinedTypes.push_back(refinedType); ++ } ++ ++ SmallVector refinedTypes; ++ if (failed(hlo::unflattenTupleTypes(op->getResultTypes(), ++ flattenedRefinedTypes, refinedTypes))) ++ return failure(); ++ return refineReturnTypes(rewriter, op, refinedTypes); ++} ++ + namespace { + + // DenseElementsAttr can be constructed from ArrayRef but not from +@@ -422,245 +611,6 @@ + // StableHLO-specific extension to refine return types based on potentially + // refined operands. + +-// Refines the values using the given types. +-// Tricky implementation details: +-// 1) Need to support partial shape refinements, e.g. if just a single +-// dimension size out of an entire tensor type got refined. This is done +-// via inferMostSpecificType. +-// 2) Need to signal propagation of the refined shapes across the +-// StableHLO program. Different callers of this function have different +-// propagation needs, so this function doesn't signal anything on its own +-// and leaves that to the callers. +-LogicalResult refineValues(PatternRewriter& rewriter, Operation* op, +- ValueRange values, TypeRange types) { +- if (values.size() != types.size()) +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "refineValues failed for " << types << ": expected " +- << values.size() << " types, got " << types.size(); +- }); +- +- // Check whether `types` contain any new information with respect to existing +- // return types. Even if just a single dimension size out of an entire tensor +- // type got updated, using `inferMostSpecificType` ensures that we don't +- // miss that. +- bool needsRefinement = false; +- SmallVector refinedTypes; +- for (auto it : llvm::zip(values.getTypes(), types)) { +- // Cannot use structured bindings to simplify this because capturing +- // structured bindings in a lambda is a C++ 20 extension. +- auto currentType = std::get<0>(it); +- auto refinement = std::get<1>(it); +- auto refinedType = hlo::inferMostSpecificType( +- /*location=*/{}, {currentType, refinement}); +- if (failed(refinedType)) +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "inferMostSpecificType failed for " << currentType << " and " +- << refinement; +- }); +- refinedTypes.push_back(*refinedType); +- needsRefinement |= (currentType != *refinedType); +- } +- if (!needsRefinement) +- return rewriter.notifyMatchFailure(op, "doesn't need refinement"); +- +- for (auto it : llvm::zip(values, refinedTypes)) { +- // Cannot use structured bindings to simplify this because capturing +- // structured bindings in a lambda is a C++ 20 extension. +- auto value = std::get<0>(it); +- auto refinedType = std::get<1>(it); +- if (value.getType() == refinedType) continue; +- +- // Check whether the users of this value are ready for the type of the +- // value to be refined. +- for (Operation* user : value.getUsers()) { +- // CHLO and StableHLO ops are designed to support type refinements of +- // their operands and results. Any operand type in these ops can change +- // within what's supported by `inferMostSpecificType` without breaking +- // verification of the op. 
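The user check in refineValues (present in the new copy added earlier and in the old copy being deleted around this point) is effectively a three-way decision: CHLO/StableHLO users tolerate the refined operand type directly, func.return is guarded with a cast until a dedicated pattern updates the enclosing FunctionType, and any other user blocks the refinement. A minimal standalone sketch of that decision, with hypothetical names and plain strings standing in for dialects and ops (this is not the patch's code):

#include <string>

enum class RefinementAction { kAllow, kGuardWithCast, kReject };

// Classifies a user of a value whose type is about to be refined.
RefinementAction classifyUser(const std::string& dialect,
                              const std::string& opName) {
  // CHLO and StableHLO ops still verify when an operand becomes more static.
  if (dialect == "chlo" || dialect == "stablehlo")
    return RefinementAction::kAllow;
  // func.return needs a cast guard so the FunctionType stays consistent until
  // a dedicated pattern propagates the refined return type.
  if (dialect == "func" && opName == "return")
    return RefinementAction::kGuardWithCast;
  // Everything else is outside the allowlist and blocks the refinement.
  return RefinementAction::kReject;
}

int main() {
  bool ok =
      classifyUser("stablehlo", "add") == RefinementAction::kAllow &&
      classifyUser("func", "return") == RefinementAction::kGuardWithCast &&
      classifyUser("scf", "for") == RefinementAction::kReject;
  return ok ? 0 : 1;
}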
+- if (isa(user->getDialect())) +- continue; +- +- // Simply changing operand type of `func.return` won't work because +- // that won't update the FunctionType of the enclosing `func.func`. +- // Nonetheless, we still want to support these ops because they are widely +- // used in StableHLO programs (although the plan of record is to replace +- // `func.return` ops in StableHLO programs with `stablehlo.return`: +- // https://github.com/openxla/stablehlo/issues/425). +- if (isa(user)) continue; +- +- // Unlike in TensorFlow's type inference pass, here we work only with +- // allowlisted ops to focus our support on well-defined semantics of +- // StableHLO programs. +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "unsupported refinement: tried to refine " << value.getType() +- << " to " << refinedType << " for user " << user; +- }); +- } +- +- // Happy path: simply call setType here because most of our users are +- // fine with that. +- auto unrefinedType = value.getType(); +- value.setType(refinedType); +- +- // Special case: for `func.return`, guard the refinement with a cast +- // and leave propagation of the refined return type to a dedicated pattern. +- auto isFuncReturn = [](OpOperand& use) -> bool { +- return isa(use.getOwner()); +- }; +- if (llvm::none_of(value.getUses(), isFuncReturn)) continue; +- rewriter.setInsertionPointAfter(op); +- auto castToUnrefinedType = rewriter.create( +- op->getLoc(), unrefinedType, value); +- value.replaceUsesWithIf(castToUnrefinedType.getOutputs()[0], isFuncReturn); +- } +- +- return success(); +-} +- +-// Refines the return types of the given operation using the given types. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. +-LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, +- ArrayRef types) { +- if (failed(refineValues(rewriter, op, op->getResults(), types))) +- return failure(); +- +- // This `replaceOpWithIf` call doesn't actually change the IR, but +- // it does ask the rewriter to visit all the users of this op. There is no +- // upstream API to achieve this directly, but if it's introduced in the +- // future, we could use it here. +- rewriter.replaceOpWithIf(op, op->getResults(), +- [](OpOperand& use) { return false; }); +- return success(); +-} +- +-// Refines the return types of the given operation using the given types. +-// Tricky implementation details: +-// 1) `types` can include non-shaped types. If there are tuple types, +-// then they are first flattened into non-tuple types using in-order +-// traversal, and only then we apply the refinements. If there are other +-// types, then the corresponding refinements must be completely empty. +-// 2) Encodings are not supported. In principle, TypeExtensions should be +-// supportable, but this needs careful thinking through. Given that no one +-// asked for support for bounded dynamism in this pass yet, this is left +-// for future work. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. 
+-LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, +- ArrayRef refinements) { +- SmallVector flattenedTypes; +- hlo::flattenTupleTypes(op->getResultTypes(), flattenedTypes); +- auto flattenedSize = flattenedTypes.size(); +- if (flattenedSize != refinements.size()) +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "refineReturnTypes failed: expected " << flattenedSize +- << " refinements, got " << refinements.size(); +- }); +- +- SmallVector flattenedRefinedTypes; +- for (auto it : llvm::zip(flattenedTypes, refinements)) { +- // Cannot use structured bindings to simplify this because capturing +- // structured bindings in a lambda is a C++ 20 extension. +- ShapedType currentType = std::get<0>(it).dyn_cast(); +- ShapedTypeComponents refinement = std::get<1>(it); +- auto failWithReason = [&](StringRef reason) { +- return rewriter.notifyMatchFailure(op, [&](Diagnostic& diag) { +- diag << "refineTypes failed: refining " << currentType +- << "with refinement: {"; +- if (refinement.hasRank()) { +- diag << "shape = [" << refinement.getDims() << "]"; +- if (refinement.getAttribute()) +- diag << "attribute = " << refinement.getAttribute(); +- } else { +- diag << "hasRank = false"; +- } +- diag << ", elementType = " << refinement.getElementType(); +- diag << "} failed: " << reason; +- }); +- }; +- +- // If the current type is not a shaped type, then the refinement must +- // be completely empty. +- if (!currentType) { +- if (refinement.hasRank() || refinement.getElementType() || +- refinement.getAttribute()) +- return failWithReason("unsupported refinement"); +- flattenedRefinedTypes.push_back(currentType); +- continue; +- } +- +- // If the refinement has an element type, then it must be the same as +- // the current element type. +- Type currentElementType = currentType.getElementType(); +- if (refinement.getElementType() && +- currentElementType != refinement.getElementType()) +- return failWithReason("expected compatible element types"); +- +- // If neither the current type nor the refinement are ranked, then there's +- // nothing to refine, and we return the current type. +- bool hasRank = currentType.hasRank() || refinement.hasRank(); +- if (!hasRank) { +- flattenedRefinedTypes.push_back(currentType); +- continue; +- } +- +- // If either the current type or the refinement have encodings, then +- // we fail. Encodings are left for future work. +- Attribute currentEncoding = nullptr; +- if (auto currentRankedType = currentType.dyn_cast()) { +- currentEncoding = currentRankedType.getEncoding(); +- } +- Attribute refinedEncoding = refinement.getAttribute(); +- if (currentEncoding || refinedEncoding) +- return failWithReason("expected compatible encodings"); +- +- // If both the current type and the refinement have shapes, use the shape +- // from the refinement. Otherwise, pick whatever is available. +- // Make sure that the resulting type is compatible with the current type +- // to avoid creating invalid code. +- auto refinedShape = +- refinement.hasRank() ? 
refinement.getDims() : currentType.getShape(); +- auto refinedType = RankedTensorType::get(refinedShape, currentElementType); +- if (!hlo::isCompatibleForHloTypeInference(currentType, refinedType)) +- return failWithReason("expected compatible shapes"); +- flattenedRefinedTypes.push_back(refinedType); +- } +- +- SmallVector refinedTypes; +- if (failed(hlo::unflattenTupleTypes(op->getResultTypes(), +- flattenedRefinedTypes, refinedTypes))) +- return failure(); +- return refineReturnTypes(rewriter, op, refinedTypes); +-} +- +-// Refines the return type of the given operation using the given shape. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. +-template +-LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, +- ArrayRef shape) { +- return refineReturnTypes(rewriter, op, ShapedTypeComponents(shape)); +-} +- +-// Refines the return type of the given operation using the given shape. +-// This function also signals PatternRewriter that it needs to visit all the +-// users of this op if any updates to its results have happened during execution +-// of the function. +-template +-LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, +- Value shapeValue) { +- // At the moment, we only support refining return types using fully static +- // shape values which serves the current use cases well. +- // Support for partially static shape values is left for future work. +- SmallVector shape; +- if (failed(hlo::matchInts(shapeValue, shape))) +- return rewriter.notifyMatchFailure(op, "expected constant output shape"); +- return refineReturnShape(rewriter, op, shape); +-} +- + struct RefineAllGatherOpPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(AllGatherOp op, +@@ -1115,39 +1065,8 @@ + using StablehloRefineShapesPassBase::StablehloRefineShapesPassBase; + + void runOnOperation() override { +- // Only one function per module is supported at the moment to avoid the need +- // to think about iterative type inference algorithms. +- // Current use cases are served well by inlining multiple functions into +- // a single function, so we leave native support for multiple functions to +- // future work. +- // To enable modules that contain CustomCallOp::called_computations, +- // we allow multiple functions, in which case we only refine the main +- // function called "main", assuming that the called computations will have +- // static shapes. Lifting this assumption and expanding refinement to +- // multiple functions is left for future work. +- ModuleOp module = getOperation(); +- auto funcs = llvm::to_vector(module.getOps()); +- if (funcs.empty()) return; +- func::FuncOp func; +- if (funcs.size() == 1) { +- func = funcs[0]; +- } else { +- func = module.lookupSymbol("main"); +- } +- if (!func) { +- module.emitOpError() +- << "must have no more than one function or a `main`" +- << " function to clearly identify which function will be refined"; +- return signalPassFailure(); +- } +- +- // Similarly, only one block per function is supported at the moment. +- // At the StableHLO level, functions are expected to only have one block, +- // so supporting more is out of scope for this pass. 
+- if (!func.getRegion().hasOneBlock()) { +- func.emitOpError() << "must have exactly one block"; +- return signalPassFailure(); +- } ++ auto func = getStablehloRefineShapesTarget(getOperation()); ++ if (!func) return signalPassFailure(); + + // The algorithm behind this pass consists of a single traversal of the + // function. This is sufficient because we only support one function per +@@ -1163,44 +1082,7 @@ + config.strictMode = GreedyRewriteStrictness::AnyOp; + + RewritePatternSet patterns(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); +- patterns.add(&getContext()); ++ populateStablehloRefineShapesPatterns(&patterns, &getContext()); + if (failed( + applyPatternsAndFoldGreedily(func, std::move(patterns), config))) { + return signalPassFailure(); +@@ -1209,5 +1091,86 @@ + }; + + } // namespace ++ ++func::FuncOp getStablehloRefineShapesTarget(ModuleOp module) { ++ // Only one function per module is supported at the moment to avoid the need ++ // to think about iterative type inference algorithms. ++ // Current use cases are served well by inlining multiple functions into ++ // a single function, so we leave native support for multiple functions to ++ // future work. ++ // To enable modules that contain CustomCallOp::called_computations, ++ // we allow multiple functions, in which case we only refine the main ++ // function called "main", assuming that the called computations will have ++ // static shapes. Lifting this assumption and expanding refinement to ++ // multiple functions is left for future work. ++ auto funcs = llvm::to_vector(module.getOps()); ++ if (funcs.empty()) return nullptr; ++ ++ func::FuncOp result; ++ if (funcs.size() == 1) { ++ result = funcs[0]; ++ } else { ++ result = module.lookupSymbol("main"); ++ } ++ if (!result) { ++ module.emitOpError() ++ << "must have no more than one function or a `main`" ++ << " function to clearly identify which function will be refined"; ++ return nullptr; ++ } ++ ++ // Similarly, only one block per function is supported at the moment. ++ // At the StableHLO level, functions are expected to only have one block, ++ // so supporting more is out of scope for this pass. 
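For reference, the function-selection rule that the new getStablehloRefineShapesTarget helper spells out above (no functions: nothing to refine; exactly one function: refine it; several functions: refine only `main`, otherwise report an error) can be summarized in a few lines. This is an illustrative, MLIR-free sketch with made-up names, not code from the patch:

#include <optional>
#include <string>
#include <vector>

// Picks the function that shape refinement should run on, or std::nullopt if
// the module is empty or ambiguous (the caller then emits the error).
std::optional<std::string> pickRefinementTarget(
    const std::vector<std::string>& funcNames) {
  if (funcNames.empty()) return std::nullopt;      // nothing to refine
  if (funcNames.size() == 1) return funcNames[0];  // the only function wins
  for (const std::string& name : funcNames)        // otherwise require "main"
    if (name == "main") return name;
  return std::nullopt;  // ambiguous module
}

int main() {
  bool ok = pickRefinementTarget({"f"}).value_or("") == "f" &&
            pickRefinementTarget({"main", "helper"}).value_or("") == "main" &&
            !pickRefinementTarget({"a", "b"}).has_value();
  return ok ? 0 : 1;
}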
++ if (!result.getRegion().hasOneBlock()) { ++ result.emitOpError() << "must have exactly one block"; ++ return nullptr; ++ } ++ ++ return result; ++} ++ ++void populateStablehloRefineShapesPatterns(RewritePatternSet* patterns, ++ MLIRContext* context) { ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++ patterns->add(context); ++} ++ + } // namespace stablehlo + } // namespace mlir +diff --ruN a/stablehlo/stablehlo/transforms/StablehloRefineShapes.h b/stablehlo/stablehlo/transforms/StablehloRefineShapes.h +--- stablehlo/stablehlo/transforms/StablehloRefineShapes.h ++++ stablehlo/stablehlo/transforms/StablehloRefineShapes.h +@@ -0,0 +1,102 @@ ++/* Copyright 2022 The StableHLO Authors. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#ifndef STABLEHLO_TRANSFORMS_STABLEHLO_REFINE_SHAPES_H ++#define STABLEHLO_TRANSFORMS_STABLEHLO_REFINE_SHAPES_H ++ ++#include "llvm/ADT/SmallVector.h" ++#include "mlir/Dialect/Func/IR/FuncOps.h" ++#include "mlir/IR/BuiltinOps.h" ++#include "mlir/IR/Operation.h" ++#include "mlir/IR/PatternMatch.h" ++#include "mlir/IR/Types.h" ++#include "mlir/IR/Value.h" ++#include "mlir/Interfaces/InferTypeOpInterface.h" ++#include "mlir/Support/LogicalResult.h" ++#include "stablehlo/dialect/Base.h" ++ ++namespace mlir { ++namespace stablehlo { ++ ++// Gets a FuncOp that --stablehlo-refine-shapes will run on. ++// Returns a nullptr and emits appropriate errors if such a function cannot ++// be obtained from the module. ++func::FuncOp getStablehloRefineShapesTarget(ModuleOp module); ++ ++// Refines the values using the given types. ++// Tricky implementation details: ++// 1) Need to support partial shape refinements, e.g. if just a single ++// dimension size out of an entire tensor type got refined. This is done ++// via inferMostSpecificType. ++// 2) Need to signal propagation of the refined shapes across the ++// StableHLO program. 
Different callers of this function have different ++// propagation needs, so this function doesn't signal anything on its own ++// and leaves that to the callers. ++LogicalResult refineValues(PatternRewriter& rewriter, Operation* op, ++ ValueRange values, TypeRange types); ++ ++// Refines the return types of the given operation using the given types. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef types); ++ ++// Refines the return types of the given operation using the given types. ++// Tricky implementation details: ++// 1) `types` can include non-shaped types. If there are tuple types, ++// then they are first flattened into non-tuple types using in-order ++// traversal, and only then we apply the refinements. If there are other ++// types, then the corresponding refinements must be completely empty. ++// 2) Encodings are not supported. In principle, TypeExtensions should be ++// supportable, but this needs careful thinking through. Given that no one ++// asked for support for bounded dynamism in this pass yet, this is left ++// for future work. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++LogicalResult refineReturnTypes(PatternRewriter& rewriter, Operation* op, ++ ArrayRef refinements); ++ ++// Refines the return type of the given operation using the given shape. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++template ++LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, ++ ArrayRef shape) { ++ return refineReturnTypes(rewriter, op, ShapedTypeComponents(shape)); ++} ++ ++// Refines the return type of the given operation using the given shape. ++// This function also signals PatternRewriter that it needs to visit all the ++// users of this op if any updates to its results have happened during execution ++// of the function. ++template ++LogicalResult refineReturnShape(PatternRewriter& rewriter, OpType op, ++ Value shapeValue) { ++ // At the moment, we only support refining return types using fully static ++ // shape values which serves the current use cases well. ++ // Support for partially static shape values is left for future work. 
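As the comment directly above notes, the Value-based refineReturnShape overload only fires when the shape operand can be matched to fully known integer constants (via hlo::matchInts). A small self-contained sketch of that contract, using std::optional in place of SSA values and hypothetical names throughout (this is an illustration, not the StableHLO API):

#include <cstdint>
#include <optional>
#include <vector>

using MaybeDim = std::optional<int64_t>;

// Returns the fully static shape, or std::nullopt if any dimension is not a
// known constant ("expected constant output shape" in the real pass).
std::optional<std::vector<int64_t>> matchConstantShape(
    const std::vector<MaybeDim>& dims) {
  std::vector<int64_t> shape;
  shape.reserve(dims.size());
  for (const MaybeDim& d : dims) {
    if (!d) return std::nullopt;  // one unknown dimension aborts the match
    shape.push_back(*d);
  }
  return shape;
}

int main() {
  // {2, 3} is fully constant, so the refinement can proceed.
  auto ok = matchConstantShape({2, 3});
  // {2, ?} has an unknown dimension, so the pattern would not fire.
  auto notOk = matchConstantShape({2, std::nullopt});
  return (ok && !notOk) ? 0 : 1;
}

In other words, a single unknown dimension leaves the op's return type untouched rather than producing a partially refined shape.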
++ SmallVector shape; ++ if (failed(hlo::matchInts(shapeValue, shape))) ++ return rewriter.notifyMatchFailure(op, "expected constant output shape"); ++ return refineReturnShape(rewriter, op, shape); ++} ++ ++} // namespace stablehlo ++} // namespace mlir ++ ++#endif // STABLEHLO_TRANSFORMS_STABLEHLO_REFINE_SHAPES_H +diff --ruN a/stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp b/stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp +--- stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp ++++ stablehlo/stablehlo/transforms/VhloLegalizeToStablehlo.cpp +@@ -430,9 +430,20 @@ + SmallVector& stablehloAttrs) { + auto tensorAttr = dyn_cast(vhloAttr); + if (!tensorAttr) return specialFailure(); +- ArrayRef data( +- reinterpret_cast(tensorAttr.getData().data()), +- tensorAttr.getData().size() / sizeof(int64_t)); ++ ++ auto data = ArrayRef( ++ reinterpret_cast(tensorAttr.getData().data()), ++ tensorAttr.getData().size() / sizeof(int64_t)) ++ .vec(); ++ ++ // Handle splats ++ if (data.size() == 1) { ++ auto tensorType = tensorAttr.getType().dyn_cast(); ++ if (!tensorType || (tensorType.getShape().size() != 1)) ++ return specialFailure(); ++ auto size = tensorType.getShape()[0]; ++ data.resize(size, data[0]); ++ } + + stablehloAttrs.emplace_back( + vhloName, DenseI64ArrayAttr::get(vhloAttr.getContext(), data)); diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl index 8d7054dda8b2c0..f175093e925b74 100644 --- a/third_party/xla/third_party/stablehlo/workspace.bzl +++ b/third_party/xla/third_party/stablehlo/workspace.bzl @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): # LINT.IfChange - STABLEHLO_COMMIT = "04291aea6b50d9573e6f4de184938d83b9564cd0" - STABLEHLO_SHA256 = "2f57b2cb8eeadebe8430e294f88919b392cf472c62fdd40d4713680b283d64e5" + STABLEHLO_COMMIT = "ab709fe48de88c67717abfbd7ef17425eb95ddaf" + STABLEHLO_SHA256 = "a469ecc3d6747f9effdc1c7813568953dd1dc30070ca8f4f6f8a4d405e8c687e" # LINT.ThenChange(Google-internal path) tf_http_archive( diff --git a/third_party/xla/third_party/triton/cl577369732.patch b/third_party/xla/third_party/triton/cl577369732.patch deleted file mode 100644 index e63b9f3804974b..00000000000000 --- a/third_party/xla/third_party/triton/cl577369732.patch +++ /dev/null @@ -1,116 +0,0 @@ -==== triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp#19 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp 2023-10-19 14:55:11.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -759,7 +759,7 @@ - OpBuilder builder(forOp); - // Get init operands for loop carried values - for (BlockArgument &arg : forOp.getRegionIterArgs()) { -- OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg); -+ OpOperand &operand = *forOp.getTiedLoopInit(arg); - setValueMapping(arg, operand.get(), 0); - } - -==== triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp#10 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp 2023-10-19 14:55:11.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/Prefetch.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -188,7 +188,7 @@ - 
auto getIncomingOp = [this](Value v) -> Value { - if (auto arg = v.dyn_cast()) - if (arg.getOwner()->getParentOp() == forOp.getOperation()) -- return forOp.getOpOperandForRegionIterArg(arg).get(); -+ return forOp.getTiedLoopInit(arg)->get(); - return Value(); - }; - -@@ -298,10 +298,10 @@ - Operation *firstDot = builder.clone(*dot, mapping); - if (Value a = operand2headPrefetch.lookup(dot.getA())) - firstDot->setOperand( -- 0, newForOp.getRegionIterArgForOpOperand(*a.use_begin())); -+ 0, newForOp.getTiedLoopRegionIterArg(&*a.use_begin())); - if (Value b = operand2headPrefetch.lookup(dot.getB())) - firstDot->setOperand( -- 1, newForOp.getRegionIterArgForOpOperand(*b.use_begin())); -+ 1, newForOp.getTiedLoopRegionIterArg(&*b.use_begin())); - - // remaining part - int64_t kOff = prefetchWidth; -==== triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp#18 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp 2023-10-24 18:31:01.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -245,7 +245,7 @@ - for (OpOperand &use : value.getUses()) { - Operation *user = use.getOwner(); - if (auto forOp = dyn_cast(user)) { -- Value arg = forOp.getRegionIterArgForOpOperand(use); -+ Value arg = forOp.getTiedLoopRegionIterArg(&use); - Value result = forOp.getResultForOpOperand(use); - setEncoding({arg, result}, info, changed, user); - continue; -@@ -767,7 +767,7 @@ - SmallVector newOperands; - for (auto arg : forOp.getRegionIterArgs()) { - if (slice.count(arg)) { -- OpOperand &initVal = forOp.getOpOperandForRegionIterArg(arg); -+ OpOperand &initVal = *forOp.getTiedLoopInit(arg); - argMapping.push_back(std::make_pair( - forOp.getResultForOpOperand(initVal).getResultNumber(), - forOp.getInitArgs().size() + newOperands.size())); -==== triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp#16 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp 2023-10-24 18:31:01.000000000 -0700 -+++ triton/lib/Dialect/TritonGPU/Transforms/Utility.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -430,10 +430,10 @@ - Block *block = blockArg.getOwner(); - Operation *parentOp = block->getParentOp(); - if (auto forOp = dyn_cast(parentOp)) { -- OpOperand &initOperand = forOp.getOpOperandForRegionIterArg(blockArg); -+ OpOperand *initOperand = forOp.getTiedLoopInit(blockArg); - Value yieldOperand = forOp.getBody()->getTerminator()->getOperand( - blockArg.getArgNumber() - forOp.getNumInductionVars()); -- queue.push_back({initOperand.get(), encoding}); -+ queue.push_back({initOperand->get(), encoding}); - queue.push_back({yieldOperand, encoding}); - continue; - } -==== triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp#1 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp ==== -# action=edit type=text ---- triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp 2023-10-12 01:35:16.000000000 -0700 -+++ triton/lib/Dialect/TritonNvidiaGPU/Transforms/Utility.cpp 2023-10-27 20:17:46.000000000 -0700 -@@ -88,9 +88,8 @@ - auto parentOp = blockArg.getOwner()->getParentOp(); - if (auto forOp = 
dyn_cast(parentOp)) { - if (blockArg.getArgNumber() >= forOp.getNumInductionVars()) { -- if (failed(getDependentPointers( -- forOp.getOpOperandForRegionIterArg(blockArg).get(), -- dependentSet, processedSet))) -+ if (failed(getDependentPointers(forOp.getTiedLoopInit(blockArg)->get(), -+ dependentSet, processedSet))) - return failure(); - - unsigned operandIdx = -@@ -383,7 +382,7 @@ - if (failed(addControlOperandsForForOp(forOp))) - return failure(); - if (blockArg.getArgNumber() >= forOp.getNumInductionVars()) { -- Value operand = forOp.getOpOperandForRegionIterArg(blockArg).get(); -+ Value operand = forOp.getTiedLoopInit(blockArg)->get(); - if (failed(tryInsertAndPropagate(operand))) - return failure(); - -==== triton/test/lib/Analysis/TestAlias.cpp#5 - /google/src/cloud/springerm/mlir_3cd2a0bc1a2dcf851f1821765946b77d0e65bd2e_1698463035/triton/test/lib/Analysis/TestAlias.cpp ==== -# action=edit type=text ---- triton/test/lib/Analysis/TestAlias.cpp 2023-10-19 14:55:11.000000000 -0700 -+++ triton/test/lib/Analysis/TestAlias.cpp 2023-10-27 20:17:47.000000000 -0700 -@@ -87,7 +87,7 @@ - } - if (auto forOp = dyn_cast(op)) { - for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) { -- auto operand = forOp.getOpOperandForRegionIterArg(arg.value()).get(); -+ auto operand = forOp.getTiedLoopInit(arg.value())->get(); - auto opNames = getAllocOpNames(operand); - auto argName = getValueOperandName(arg.value(), state); - print(argName, opNames, os); diff --git a/third_party/xla/third_party/triton/cl577379396.patch b/third_party/xla/third_party/triton/cl577379396.patch deleted file mode 100644 index ee569f9b8f55c3..00000000000000 --- a/third_party/xla/third_party/triton/cl577379396.patch +++ /dev/null @@ -1,33 +0,0 @@ -diff --git a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp ---- a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp -+++ b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp -@@ -246,7 +246,7 @@ SmallVector LayoutPropagation::pr - Operation *user = use.getOwner(); - if (auto forOp = dyn_cast(user)) { - Value arg = forOp.getTiedLoopRegionIterArg(&use); -- Value result = forOp.getResultForOpOperand(use); -+ Value result = forOp.getTiedLoopResult(&use); - setEncoding({arg, result}, info, changed, user); - continue; - } -@@ -769,7 +769,7 @@ static void rewriteSlice(SetVector()) { - auto result = value.cast(); -- OpOperand &forOperand = nestedFor.getOpOperandForResult(result); -+ OpOperand &forOperand = *nestedFor.getTiedLoopInit(result); - markLive(forOperand.get()); - auto nestedYieldOp = - cast(nestedFor.getBody()->getTerminator()); diff --git a/third_party/xla/third_party/triton/workspace.bzl b/third_party/xla/third_party/triton/workspace.bzl index c0c6207f85da73..b864617b503f3e 100644 --- a/third_party/xla/third_party/triton/workspace.bzl +++ b/third_party/xla/third_party/triton/workspace.bzl @@ -5,8 +5,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): """Imports Triton.""" - TRITON_COMMIT = "cl578837341" - TRITON_SHA256 = "0d8112bb31d48b5beadbfc2e13c52770a95d3759b312b15cf26dd72e71410568" + TRITON_COMMIT = "cl588045313" + TRITON_SHA256 = "14cb6ddccc3139b2e8d77af08bb232eb06536d5c715c4bbc720a752af40ba2dc" tf_http_archive( name = "triton", @@ -15,7 +15,7 @@ def repo(): urls = tf_mirror_urls("https://github.com/openxla/triton/archive/{commit}.tar.gz".format(commit = TRITON_COMMIT)), # For temporary changes which haven't landed upstream yet. 
patch_file = [ - "//third_party/triton:cl568176943.patch", "//third_party/triton:b304456327.patch", + "//third_party/triton:cl568176943.patch", ], ) diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc index e9fc2d4eb20a55..9de6b6e0c2bd54 100644 --- a/third_party/xla/third_party/tsl/.bazelrc +++ b/third_party/xla/third_party/tsl/.bazelrc @@ -526,34 +526,9 @@ build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_c build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl" test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --config=cuda +build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1 -build:rbe_linux_cuda_nvcc --@local_xla//xla/python:enable_gpu=true -build:rbe_linux_cuda_nvcc --@local_xla//xla/python:jax_cuda_pip_rpaths=true -build:rbe_linux_cuda_nvcc --define=xla_python_enable_gpu=true -build:rbe_linux_cuda_nvcc --config=tensorrt -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_75,compute_80" -build:rbe_linux_cuda_nvcc --action_env=TF_CUDA_VERSION="12" -build:rbe_linux_cuda_nvcc --action_env=TF_CUDNN_VERSION="8" -build:rbe_linux_cuda_nvcc --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.2" -build:rbe_linux_cuda_nvcc --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc" -build:rbe_linux_cuda_nvcc --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc --config=rbe_linux -build:rbe_linux_cuda_nvcc --host_crosstool_top="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --host_platform="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --platforms="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_platform//:platform" -build:rbe_linux_cuda_nvcc --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-clang_manylinux2014-cuda12.2-cudnn8.9_config_python3.9" -build:rbe_linux_cuda_nvcc --python_path="/usr/bin/python3" -# These you may need to change for your own GCP project. 
-common:rbe_linux_cuda_nvcc --remote_instance_name=projects/tensorflow-testing/instances/default_instance -build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1 -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_cuda" -build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_tensorrt" -build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda12.2-cudnn8.9_config_nccl" -test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" +build:rbe_linux_cuda_nvcc --action_env=TF_NVCC_CLANG="1" # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed build:rbe_win --config=rbe_base @@ -692,19 +667,39 @@ build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda build:release_cpu_macos --config=avx_linux test:release_cpu_macos --config=release_base -# Build configs for macOS ARM CPUs +# Base build configs for macOS +build:release_macos_base --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer +build:release_macos_base --define=no_nccl_support=true --output_filter=^$ + +# Build configs for macOS x86 +build:release_macos_x86 --config=release_macos_base +# Build with the AVX instruction set when on macOS x86 +build:release_macos_x86 --config=avx_linux +build:release_macos_x86 --cpu=darwin +# Target Catalina as the minimum compatible OS version +build:release_macos_x86 --macos_minimum_os=10.15 +build:release_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15 + +# Build configs for macOS Arm64 +build:release_macos_arm64 --config=release_macos_base build:release_macos_arm64 --cpu=darwin_arm64 -# Set DEVELOPER_DIR to select a version of Xcode. -build:release_macos_arm64 --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer -build:release_macos_arm64 --define=no_nccl_support=true -# Suppress all warning messages -build:release_macos_arm64 --output_filter=^$ -# Disable MKL build:release_macos_arm64 --define=tensorflow_mkldnn_contraction_kernel=0 # Target Moneterey as the minimum compatible OS version build:release_macos_arm64 --macos_minimum_os=12.0 build:release_macos_arm64 --action_env MACOSX_DEPLOYMENT_TARGET=12.0 +# Base test configs for macOS +test:release_macos_base --verbose_failures=true --local_test_jobs=HOST_CPUS +test:release_macos_base --test_timeout=300,450,1200,3600 --test_output=errors +test:release_macos_base --build_tests_only --keep_going +test:release_macos_base --flaky_test_attempts=3 + +# Test configs for macOS x86 +test:release_macos_x86 --config=release_macos_base + +# Test configs for macOS Arm64 +test:release_macos_arm64 --config=release_macos_base + # TODO(kanglan): Update windows configs after b/289091160 is fixed build:release_cpu_windows --config=avx_win build:release_cpu_windows --define=no_tensorflow_py_deps=true @@ -723,10 +718,14 @@ build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compil # Use --config=tf_public_cache to try and use the TensorFlow public build cache # to build TensorFlow. Look at ci/official/envs to find which types of jobs -# push to the cache. +# push to the cache. For macOS, use --config=tf_public_macos_cache build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false # Cache pushes are limited to TF's CI system. 
build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials +# Public cache for macOS builds +build:tf_public_macos_cache --remote_cache="https://storage.googleapis.com/tensorflow-macos-bazel-cache/oct2023" --remote_upload_local_results=false +# Cache pushes are limited to TF's CI system. +build:tf_public_macos_cache_push --config=tf_public_macos_cache --remote_upload_local_results=true --google_default_credentials # END TF CACHE HELPER OPTIONS # BEGIN TF TEST SUITE OPTIONS @@ -743,22 +742,27 @@ build:linux_libtensorflow_build -- //tensorflow/tools/lib_package:libtensorflow. test:linux_cpu_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cpu_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cpu_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cpu_wheel_test --config=linux_cpu_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # CUDA WHEEL -test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 -test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:linux_cuda_wheel_test_filters --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 +test:linux_cuda_wheel_test_filters --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-no_cuda11,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_cuda_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_wheel_test --config=linux_cuda_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 WHEEL test:linux_arm64_wheel_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_arm64_wheel_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only,-no_oss_py38,-no_oss_py39,-no_oss_py310 test:linux_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium -test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... 
-//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test +test:linux_arm64_wheel_test --config=linux_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test # MACOS ARM64 WHEEL test:macos_arm64_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 -test:macos_arm64_wheel_test_filters --test_lang_filters=py -test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:macos_arm64_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium +test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... +# MACOS X86 WHEEL +test:macos_x86_wheel_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +test:macos_x86_wheel_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +test:macos_x86_wheel_test_filters --test_lang_filters=py --test_size_filters=small,medium +test:macos_x86_wheel_test --config=macos_x86_wheel_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... # PYCPP TESTS run a suite of Python and C++ tests to verify general correctness over # the whole TF code base. 
These are usually run continuously or upon presubmit. @@ -766,21 +770,53 @@ test:macos_arm64_wheel_test --config=macos_arm64_wheel_test_filters -- //tensorf test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # CUDA PYCPP: test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11 test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium -test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... +test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 PYCPP test:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only test:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3 # TODO(michaelhudgins): Why do we need to specifically omit go and java here? -test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test +test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... 
-//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test -//tensorflow/python/tools:aot_compiled_test +# CROSS-COMPILE ARM64 PYCPP +test:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test +# Tests that fail only when cross-compiled +test:cross_compile_linux_arm64_pycpp_test -//tensorflow/compiler/mlir/quantization/stablehlo:convert_tf_quant_to_mhlo_int_test # MACOS ARM64 PYCPP test:macos_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 -test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py -test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xla/service/gpu/... -//tensorflow/compiler/xla/tools/multihost_hlo_runner/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... -//tensorflow/compiler/xla/tests:local_client_aot_test_computation -//tensorflow/compiler/xla/tests:local_client_aot_test_helper -//tensorflow/compiler/xla/tests:local_client_aot_test +test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium +test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... -//tensorflow/core/kernels/image:resize_bicubic_op_test # END TF TEST SUITE OPTIONS + +# START LINUX AARCH64 CROSS-COMPILE CONFIGS +# Set execution platform to Linux x86 +# Note: Lot of the "host_" flags such as "host_cpu" and "host_crosstool_top" +# flags seem to be actually used to specify the execution platform details. It +# seems it is this way because these flags are old and predate the distinction +# between host and execution platform. 
+build:cross_compile_linux_arm64 --host_cpu=k8 +build:cross_compile_linux_arm64 --host_crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite +build:cross_compile_linux_arm64 --extra_execution_platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_x86_64 + +# Set the target CPU to Aarch64 +build:cross_compile_linux_arm64 --platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_aarch64 +build:cross_compile_linux_arm64 --cpu=aarch64 +build:cross_compile_linux_arm64 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite + +# RBE configs +build:rbe_cross_compile_linux_arm64 --config=cross_compile_linux_arm64 +build:rbe_cross_compile_linux_arm64 --config=rbe_base +build:rbe_cross_compile_linux_arm64 --remote_instance_name=projects/tensorflow-testing/instances/default_instance + +# Test-related settings below this point +# We cannot run cross-compiled tests on the remote Linux x86 VMs so we need to +# force all tests to run locally on the Aarch64 host. +test:rbe_cross_compile_linux_arm64 --strategy=TestRunner=local +test:rbe_cross_compile_linux_arm64 --verbose_failures=true --local_test_jobs=HOST_CPUS --test_output=errors +test:rbe_cross_compile_linux_arm64 --flaky_test_attempts=3 --build_tests_only +# END LINUX AARCH64 CROSS-COMPILE CONFIGS diff --git a/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh b/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh index 331efa186fb87e..4f4b0a0fdf9d31 100644 --- a/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh +++ b/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh @@ -50,7 +50,7 @@ export PATH="$PATH:/c/Python38" -- //tsl/... \ || { echo "Bazel Build Failed" && exit 1; } -# Test TSL TODO(ddunleavy) enable all tests +# Test TSL /c/tools/bazel.exe test \ --output_filter="" \ --flaky_test_attempts=3 \ @@ -60,7 +60,7 @@ export PATH="$PATH:/c/Python38" --build_tag_filters=$TAGS_FILTER \ --test_tag_filters=$TAGS_FILTER \ --keep_going \ - -- //tsl/... -//tsl/platform:subprocess_test -//tsl/platform/cloud:google_auth_provider_test -//tsl/platform/cloud:oauth_client_test \ + -- //tsl/... 
\ || { echo "Bazel Test Failed" && exit 1; } exit 0 diff --git a/third_party/xla/third_party/tsl/opensource_only.files b/third_party/xla/third_party/tsl/opensource_only.files index e4974e79805725..fa84f35768a5d2 100644 --- a/third_party/xla/third_party/tsl/opensource_only.files +++ b/third_party/xla/third_party/tsl/opensource_only.files @@ -29,7 +29,9 @@ third_party/gpus/crosstool/BUILD: third_party/gpus/crosstool/LICENSE: third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl: third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl: +third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl: third_party/gpus/cuda/BUILD.tpl: +third_party/gpus/cuda/BUILD.windows.tpl: third_party/gpus/cuda/BUILD: third_party/gpus/cuda/LICENSE: third_party/gpus/cuda/build_defs.bzl.tpl: @@ -129,6 +131,8 @@ tools/toolchains/BUILD: tools/toolchains/clang6/BUILD: tools/toolchains/cpus/py/BUILD: tools/toolchains/cpus/py3/BUILD: +tools/toolchains/cross_compile/cc/BUILD: +tools/toolchains/cross_compile/config/BUILD: tools/toolchains/embedded/arm-linux/BUILD: tools/toolchains/java/BUILD: tools/toolchains/python/BUILD: diff --git a/third_party/xla/third_party/tsl/third_party/gemmlowp/workspace.bzl b/third_party/xla/third_party/tsl/third_party/gemmlowp/workspace.bzl index b98035569852e2..884f707719a623 100644 --- a/third_party/xla/third_party/tsl/third_party/gemmlowp/workspace.bzl +++ b/third_party/xla/third_party/tsl/third_party/gemmlowp/workspace.bzl @@ -7,8 +7,8 @@ def repo(): # Attention: tools parse and update these lines. # LINT.IfChange - GEMMLOWP_COMMIT = "e844ffd17118c1e17d94e1ba4354c075a4577b88" - GEMMLOWP_SHA256 = "522b7a82d920ebd0c4408a5365866a40b81d1c0d60b2369011d315cca03c6476" + GEMMLOWP_COMMIT = "16e8662c34917be0065110bfcd9cc27d30f52fdf" + GEMMLOWP_SHA256 = "7dc418717c8456473fac4ff2288b71057e3dcb72894524c734a4362cdb51fa8b" # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/gemmlowp.cmake) tf_http_archive( diff --git a/third_party/xla/third_party/tsl/third_party/gpus/check_cuda_libs.py b/third_party/xla/third_party/tsl/third_party/gpus/check_cuda_libs.py index b7d98ef2581157..afd6380b0ac203 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/check_cuda_libs.py +++ b/third_party/xla/third_party/tsl/third_party/gpus/check_cuda_libs.py @@ -23,6 +23,7 @@ """ import os import os.path +import platform import subprocess import sys @@ -38,6 +39,10 @@ class ConfigError(Exception): pass +def _is_windows(): + return platform.system() == "Windows" + + def check_cuda_lib(path, check_soname=True): """Tests if a library exists on disk and whether its soname matches the filename. 
@@ -52,7 +57,7 @@ def check_cuda_lib(path, check_soname=True): if not os.path.isfile(path): raise ConfigError("No library found under: " + path) objdump = which("objdump") - if check_soname and objdump is not None: + if check_soname and objdump is not None and not _is_windows(): # Decode is necessary as in py3 the return type changed from str to bytes output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") output = [line for line in output.splitlines() if "SONAME" in line] diff --git a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl index 81e54ad431fccf..0da1d7b58f4bb0 100755 --- a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl @@ -45,10 +45,11 @@ import pipes # Template values set by cuda_autoconf. CPU_COMPILER = ('%{cpu_compiler}') -GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') +HOST_COMPILER_PATH = ('%{host_compiler_path}') NVCC_PATH = '%{nvcc_path}' -PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) +PREFIX_DIR = os.path.dirname(HOST_COMPILER_PATH) +USE_CLANG_COMPILER = '%{use_clang_compiler}' NVCC_VERSION = '%{cuda_version}' def Log(s): @@ -253,13 +254,23 @@ def InvokeNvcc(argv, log=False): # Force C++17 dialect (note, everything in just one string!) nvccopts += ' --std c++17 ' nvccopts += fatbin_options + # The option `-allow-unsupported-compiler` is required for the combination of + # NVCC+clang compilers. + # The following message appears if this option is not provided: + # unsupported clang version! clang version must be less than 16 and greater + # than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used + # to override this version check; however, using an unsupported host compiler + # may cause compilation failure or incorrect run time execution. + # Use at your own risk. + if USE_CLANG_COMPILER: + nvccopts += ' -allow-unsupported-compiler --expt-extended-lambda --expt-relaxed-constexpr ' if depfiles: # Generate the dependency file depfile = depfiles[0] cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + host_compiler_options + '"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + + ' --compiler-bindir=' + HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) if log: Log(cmd) @@ -269,7 +280,7 @@ def InvokeNvcc(argv, log=False): cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + host_compiler_options + ' -fPIC"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + + ' --compiler-bindir=' + HOST_COMPILER_PATH + ' -I .' 
+ ' -x cu ' + opt + includes + ' -c ' + srcs + out) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl index 8fb22313010a45..77ec948af32c6e 100755 --- a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl @@ -86,8 +86,8 @@ def GetHostCompilerOptions(argv): opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) if args.g: opts += ' -g' + ' -g'.join(sum(args.g, [])) - #if args.fno_canonical_system_headers: - # opts += ' -fno-canonical-system-headers' + if args.fno_canonical_system_headers: + opts += ' -no-canonical-prefixes' if args.sysroot: opts += ' --sysroot ' + args.sysroot[0] diff --git a/third_party/xla/third_party/tsl/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl new file mode 100644 index 00000000000000..c46e09484fdfad --- /dev/null +++ b/third_party/xla/third_party/tsl/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -0,0 +1,256 @@ +#!/usr/bin/env python +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. + +DESCRIPTION: + This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc +""" + +from argparse import ArgumentParser +import os +import subprocess +import re +import sys +import tempfile + +# Template values set by cuda_autoconf. +CPU_COMPILER = ('%{cpu_compiler}') +GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') + +NVCC_PATH = '%{nvcc_path}' +NVCC_VERSION = '%{cuda_version}' +NVCC_TEMP_DIR = "%{nvcc_tmp_dir}" + +def Log(s): + print('gpus/crosstool: {0}'.format(s)) + + +def GetOptionValue(argv, option): + """Extract the list of values for option from options. + + Args: + option: The option whose value to extract. + + Returns: + 1. A list of values, either directly following the option, + (eg., /opt val1 val2) or values collected from multiple occurrences of + the option (eg., /opt val1 /opt val2). + 2. The leftover options. 
+ """ + + parser = ArgumentParser(prefix_chars='-/') + parser.add_argument(option, nargs='*', action='append') + option = option.lstrip('-/').replace('-', '_') + args, leftover = parser.parse_known_args(argv) + if args and vars(args)[option]: + return (sum(vars(args)[option], []), leftover) + return ([], leftover) + +def _update_options(nvcc_options): + if NVCC_VERSION in ("7.0",): + return nvcc_options + + update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } + return [ update_options[opt] if opt in update_options else opt + for opt in nvcc_options ] + +def GetNvccOptions(argv): + """Collect the -nvcc_options values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + 1. The string that can be passed directly to nvcc. + 2. The leftover options. + """ + + parser = ArgumentParser() + parser.add_argument('-nvcc_options', nargs='*', action='append') + + args, leftover = parser.parse_known_args(argv) + + if args.nvcc_options: + options = _update_options(sum(args.nvcc_options, [])) + return (['--' + a for a in options], leftover) + return ([], leftover) + + +def InvokeNvcc(argv, log=False): + """Call nvcc with arguments assembled from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + log: True if logging is requested. + + Returns: + The return value of calling os.system('nvcc ' + args) + """ + + src_files = [f for f in argv if + re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] + if len(src_files) == 0: + raise Error('No source files found for cuda compilation.') + + out_file = [ f for f in argv if f.startswith('/Fo') ] + if len(out_file) != 1: + raise Error('Please specify exactly one output file for cuda compilation.') + out = ['-o', out_file[0][len('/Fo'):]] + + nvcc_compiler_options, argv = GetNvccOptions(argv) + + opt_option, argv = GetOptionValue(argv, '/O') + opt = ['-g'] + if (len(opt_option) > 0 and opt_option[0] != 'd'): + opt = ['-O2'] + + include_options, argv = GetOptionValue(argv, '/I') + includes = ["-I " + include for include in include_options] + + defines, argv = GetOptionValue(argv, '/D') + defines = [ + '-D' + define + for define in defines + if 'BAZEL_CURRENT_REPOSITORY' not in define + ] + + undefines, argv = GetOptionValue(argv, '/U') + undefines = ['-U' + define for define in undefines] + + fatbin_options, argv = GetOptionValue(argv, '-Xcuda-fatbinary') + fatbin_options = ['--fatbin-options=' + option for option in fatbin_options] + + # The rest of the unrecognized options should be passed to host compiler + host_compiler_options = [option for option in argv if option not in (src_files + out_file)] + + m_options = ["-m64"] + + nvccopts = ['-D_FORCE_INLINES'] + compute_capabilities, argv = GetOptionValue(argv, "--cuda-gpu-arch") + for capability in compute_capabilities: + capability = capability[len('sm_'):] + nvccopts += [ + r'-gencode=arch=compute_%s,"code=sm_%s"' % (capability, capability) + ] + compute_capabilities, argv = GetOptionValue(argv, '--cuda-include-ptx') + for capability in compute_capabilities: + capability = capability[len('sm_'):] + nvccopts += [ + r'-gencode=arch=compute_%s,"code=compute_%s"' % (capability, capability) + ] + _, argv = GetOptionValue(argv, '--no-cuda-include-ptx') + + # nvcc doesn't respect the INCLUDE and LIB env vars from MSVC, + # so we explicity specify the system include paths and library search paths. 
+ if 'INCLUDE' in os.environ: + nvccopts += [('--system-include="%s"' % p) for p in os.environ['INCLUDE'].split(";")] + if 'LIB' in os.environ: + nvccopts += [('--library-path="%s"' % p) for p in os.environ['LIB'].split(";")] + + nvccopts += nvcc_compiler_options + nvccopts += undefines + nvccopts += defines + nvccopts += m_options + nvccopts += fatbin_options + nvccopts += ['--compiler-options=' + ",".join(host_compiler_options)] + nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files + # Specify a unique temp directory for nvcc to generate intermediate files, + # then Bazel can ignore files under NVCC_TEMP_DIR during dependency check + # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver + # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. + if os.path.isfile(NVCC_TEMP_DIR): + os.remove(NVCC_TEMP_DIR) + if not os.path.exists(NVCC_TEMP_DIR): + os.makedirs(NVCC_TEMP_DIR) + # Provide a unique dir for each compiling action to avoid conflicts. + tempdir = tempfile.mkdtemp(dir = NVCC_TEMP_DIR) + nvccopts += ['--keep', '--keep-dir', tempdir] + # Force C++17 dialect (note, everything in just one string!) + nvccopts += ['--std c++17'] + if log: + Log([NVCC_PATH] + nvccopts) + + # Store command line options in a file to avoid hitting the character limit. + optsfile = tempfile.NamedTemporaryFile(mode='w', dir=tempdir, delete=False) + optsfile.write("\n".join(nvccopts)) + optsfile.close() + + proc = subprocess.Popen([NVCC_PATH, "--options-file", optsfile.name], + stdout=sys.stdout, + stderr=sys.stderr, + env=os.environ.copy(), + shell=True) + proc.wait() + return proc.returncode + +def ExpandParamsFileForArgv(): + new_argv = [] + for arg in sys.argv: + if arg.startswith("@"): + with open(arg.strip("@")) as f: + new_argv.extend([l.strip() for l in f.readlines()]) + else: + new_argv.append(arg) + + sys.argv = new_argv + +def ProcessFlagForCommandFile(flag): + if flag.startswith("/D") or flag.startswith("-D"): + # We need to re-escape /DFOO="BAR" as /DFOO=\"BAR\", so that we get + # `#define FOO "BAR"` after expansion as a string literal define + if flag.endswith('"') and not flag.endswith('\\"'): + flag = '\\"'.join(flag.split('"', 1)) + flag = '\\"'.join(flag.rsplit('"', 1)) + return flag + return flag + +def main(): + ExpandParamsFileForArgv() + parser = ArgumentParser() + parser.add_argument('-x', nargs=1) + parser.add_argument('--cuda_log', action='store_true') + args, leftover = parser.parse_known_args(sys.argv[1:]) + + if args.x and args.x[0] == 'cuda': + if args.cuda_log: Log('-x cuda') + if args.cuda_log: Log('using nvcc') + return InvokeNvcc(leftover, log=args.cuda_log) + + # Strip our flags before passing through to the CPU compiler for files which + # are not -x cuda. We can't just pass 'leftover' because it also strips -x. + # We not only want to pass -x to the CPU compiler, but also keep it in its + # relative location in the argv list (the compiler is actually sensitive to + # this). + cpu_compiler_flags = [flag for flag in sys.argv[1:] + if not flag.startswith(('--cuda_log')) + and not flag.startswith(('-nvcc_options'))] + output = [flag for flag in cpu_compiler_flags if flag.startswith("/Fo")] + + # Store command line options in a file to avoid hitting the character limit. 
+ if len(output) == 1: + commandfile_path = output[0][3:] + ".msvc_params" + commandfile = open(commandfile_path, "w") + cpu_compiler_flags = [ProcessFlagForCommandFile(flag) for flag in cpu_compiler_flags] + commandfile.write("\n".join(cpu_compiler_flags)) + commandfile.close() + return subprocess.call([CPU_COMPILER, "@" + commandfile_path]) + else: + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl index 700e040a88eeca..90a18b90de048c 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.tpl @@ -61,23 +61,23 @@ cuda_header_library( cc_library( name = "cudart_static", - srcs = ["cuda/lib/libcudart_static.a"], + srcs = ["cuda/lib/%{cudart_static_lib}"], linkopts = [ "-ldl", - "-lrt", "-lpthread", + %{cudart_static_linkopt} ], ) cc_library( name = "cuda_driver", - srcs = ["cuda/lib/libcuda.so"], + srcs = ["cuda/lib/%{cuda_driver_lib}"], ) cc_library( name = "cudart", - srcs = glob(["cuda/lib/libcudart.so.*"]), - data = glob(["cuda/lib/libcudart.so.*"]), + srcs = ["cuda/lib/%{cudart_lib}"], + data = ["cuda/lib/%{cudart_lib}"], linkstatic = 1, ) @@ -128,30 +128,30 @@ cuda_header_library( cc_library( name = "cublas", - srcs = glob(["cuda/lib/libcublas.so.*"]), - data = glob(["cuda/lib/libcublas.so.*"]), + srcs = ["cuda/lib/%{cublas_lib}"], + data = ["cuda/lib/%{cublas_lib}"], linkstatic = 1, ) cc_library( name = "cublasLt", - srcs = glob(["cuda/lib/libcublasLt.so.*"]), - data = glob(["cuda/lib/libcublasLt.so.*"]), + srcs = ["cuda/lib/%{cublasLt_lib}"], + data = ["cuda/lib/%{cublasLt_lib}"], linkstatic = 1, ) cc_library( name = "cusolver", - srcs = glob(["cuda/lib/libcusolver.so.*"]), - data = glob(["cuda/lib/libcusolver.so.*"]), + srcs = ["cuda/lib/%{cusolver_lib}"], + data = ["cuda/lib/%{cusolver_lib}"], linkopts = ["-lgomp"], linkstatic = 1, ) cc_library( name = "cudnn", - srcs = glob(["cuda/lib/libcudnn.so.*"]), - data = glob(["cuda/lib/libcudnn.so.*"]), + srcs = ["cuda/lib/%{cudnn_lib}"], + data = ["cuda/lib/%{cudnn_lib}"], linkstatic = 1, ) @@ -165,15 +165,15 @@ cc_library( cc_library( name = "cufft", - srcs = glob(["cuda/lib/libcufft.so.*"]), - data = glob(["cuda/lib/libcufft.so.*"]), + srcs = ["cuda/lib/%{cufft_lib}"], + data = ["cuda/lib/%{cufft_lib}"], linkstatic = 1, ) cc_library( name = "curand", - srcs = glob(["cuda/lib/libcurand.so.*"]), - data = glob(["cuda/lib/libcurand.so.*"]), + srcs = ["cuda/lib/%{curand_lib}"], + data = ["cuda/lib/%{curand_lib}"], linkstatic = 1, ) @@ -192,7 +192,7 @@ cc_library( alias( name = "cub_headers", - actual = ":cuda_headers", + actual = "%{cub_actual}", ) cuda_header_library( @@ -213,13 +213,13 @@ cuda_header_library( cc_library( name = "cupti_dsos", - data = glob(["cuda/lib/libcupti.so.*"]), + data = ["cuda/lib/%{cupti_lib}"], ) cc_library( name = "cusparse", - srcs = glob(["cuda/lib/libcusparse.so.*"]), - data = glob(["cuda/lib/libcusparse.so.*"]), + srcs = ["cuda/lib/%{cusparse_lib}"], + data = ["cuda/lib/%{cusparse_lib}"], linkopts = ["-lgomp"], linkstatic = 1, ) diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.windows.tpl new file mode 100644 index 00000000000000..dee0e898d9ae7a --- /dev/null +++ 
b/third_party/xla/third_party/tsl/third_party/gpus/cuda/BUILD.windows.tpl @@ -0,0 +1,238 @@ +load(":build_defs.bzl", "cuda_header_library") +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("@bazel_skylib//lib:selects.bzl", "selects") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +# Config setting whether TensorFlow is built with CUDA support using clang. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_clang. +selects.config_setting_group( + name = "using_clang", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_clang", + ], +) + +# Config setting whether TensorFlow is built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "@local_config_cuda//:is_cuda_enabled", + "@local_config_cuda//:is_cuda_compiler_nvcc", + ], +) + +# Equivalent to using_clang && -c opt. +selects.config_setting_group( + name = "using_clang_opt", + match_all = [ + ":using_clang", + ":_opt", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. +cuda_header_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + ":cuda-include", + ], + include_prefix = "third_party/gpus", + includes = [ + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h + "cuda/include", + ], +) + +cc_import( + name = "cudart_static", + # /WHOLEARCHIVE:cudart_static.lib will cause a + # "Internal error during CImplib::EmitThunk" error. + # Treat this library as interface library to avoid being whole archived when + # linking a DLL that depends on this. + # TODO(pcloudy): Remove this rule after b/111278841 is resolved. 
+ interface_library = "cuda/lib/%{cudart_static_lib}", + system_provided = 1, +) + +cc_import( + name = "cuda_driver", + interface_library = "cuda/lib/%{cuda_driver_lib}", + system_provided = 1, +) + +cc_import( + name = "cudart", + interface_library = "cuda/lib/%{cudart_lib}", + system_provided = 1, +) + +cuda_header_library( + name = "cublas_headers", + hdrs = [":cublas-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cublas/include"], + strip_include_prefix = "cublas/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusolver_headers", + hdrs = [":cusolver-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusolver/include"], + strip_include_prefix = "cusolver/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cufft_headers", + hdrs = [":cufft-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cufft/include"], + strip_include_prefix = "cufft/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusparse_headers", + hdrs = [":cusparse-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusparse/include"], + strip_include_prefix = "cusparse/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "curand_headers", + hdrs = [":curand-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["curand/include"], + strip_include_prefix = "curand/include", + deps = [":cuda_headers"], +) + +cc_import( + name = "cublas", + interface_library = "cuda/lib/%{cublas_lib}", + system_provided = 1, +) + +cc_import( + name = "cublasLt", + interface_library = "cuda/lib/%{cublasLt_lib}", + system_provided = 1, +) + +cc_import( + name = "cusolver", + interface_library = "cuda/lib/%{cusolver_lib}", + system_provided = 1, +) + +cc_import( + name = "cudnn", + interface_library = "cuda/lib/%{cudnn_lib}", + system_provided = 1, +) + +cc_library( + name = "cudnn_header", + hdrs = [":cudnn-include"], + include_prefix = "third_party/gpus/cudnn", + strip_include_prefix = "cudnn/include", + deps = [":cuda_headers"], +) + +cc_import( + name = "cufft", + interface_library = "cuda/lib/%{cufft_lib}", + system_provided = 1, +) + +cc_import( + name = "curand", + interface_library = "cuda/lib/%{curand_lib}", + system_provided = 1, +) + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cublasLt", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +alias( + name = "cub_headers", + actual = "%{cub_actual}", +) + +cuda_header_library( + name = "cupti_headers", + hdrs = [":cuda-extras"], + include_prefix = "third_party/gpus", + includes = ["cuda/extras/CUPTI/include/"], + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "nvml_headers", + hdrs = [":nvml"], + include_prefix = "third_party/gpus", + includes = ["cuda/nvml/include/"], + deps = [":cuda_headers"], +) + +cc_import( + name = "cupti_dsos", + interface_library = "cuda/lib/%{cupti_lib}", + system_provided = 1, +) + +cc_import( + name = "cusparse", + interface_library = "cuda/lib/%{cusparse_lib}", + system_provided = 1, +) + +cc_library( + name = "libdevice_root", + data = [":cuda-nvvm"], +) + +bzl_library( + name = "build_defs_bzl", + srcs = ["build_defs.bzl"], + deps = [ + "@bazel_skylib//lib:selects", + ], +) + +py_library( + name = "cuda_config_py", + srcs = ["cuda/cuda_config.py"], +) + +%{copy_rules} diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl 
b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl index 8a0d9eb0872911..ff2f2f41091fe8 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl @@ -4,7 +4,8 @@ * `TF_NEED_CUDA`: Whether to enable building with CUDA. * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path - * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler. + * `TF_CUDA_CLANG`: Whether to use clang for C++ and Cuda compilation. + * `TF_NVCC_CLANG`: Whether to use clang for C++ and NVCC for Cuda compilation. * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for both host and device code compilation if TF_CUDA_CLANG is 1. * `TF_SYSROOT`: The sysroot to use when compiling. @@ -26,14 +27,27 @@ """ load("//third_party/clang_toolchain:download_clang.bzl", "download_clang") +load( + "@bazel_tools//tools/cpp:lib_cc_configure.bzl", + "escape_string", + "get_env_var", +) +load( + "@bazel_tools//tools/cpp:windows_cc_configure.bzl", + "find_msvc_tool", + "find_vc_path", + "setup_vc_env_vars", +) load( "//third_party/remote_config:common.bzl", "config_repo_label", "err_out", "execute", "get_bash_bin", + "get_cpu_value", "get_host_environ", "get_python_bin", + "is_windows", "raw_exec", "read_dir", "realpath", @@ -82,7 +96,16 @@ def verify_build_defines(params): "host_compiler_warnings", "linker_bin_path", "compiler_deps", + "msvc_cl_path", + "msvc_env_include", + "msvc_env_lib", + "msvc_env_path", + "msvc_env_tmp", + "msvc_lib_path", + "msvc_link_path", + "msvc_ml_path", "unfiltered_compile_flags", + "win_compiler_deps", ]: if ("%{" + param + "}") not in params: missing.append(param) @@ -96,13 +119,104 @@ def verify_build_defines(params): ".", ) +def _get_nvcc_tmp_dir_for_windows(repository_ctx): + """Return the Windows tmp directory for nvcc to generate intermediate source files.""" + escaped_tmp_dir = escape_string( + get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( + "\\", + "\\\\", + ), + ) + return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir" + +def _get_msvc_compiler(repository_ctx): + vc_path = find_vc_path(repository_ctx) + return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/") + +def _get_win_cuda_defines(repository_ctx): + """Return CROSSTOOL defines for Windows""" + + # If we are not on Windows, return fake vaules for Windows specific fields. + # This ensures the CROSSTOOL file parser is happy. + if not is_windows(repository_ctx): + return { + "%{msvc_env_tmp}": "msvc_not_used", + "%{msvc_env_path}": "msvc_not_used", + "%{msvc_env_include}": "msvc_not_used", + "%{msvc_env_lib}": "msvc_not_used", + "%{msvc_cl_path}": "msvc_not_used", + "%{msvc_ml_path}": "msvc_not_used", + "%{msvc_link_path}": "msvc_not_used", + "%{msvc_lib_path}": "msvc_not_used", + } + + vc_path = find_vc_path(repository_ctx) + if not vc_path: + auto_configure_fail( + "Visual C++ build tools not found on your machine." 
+ + "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using", + ) + return {} + + env = setup_vc_env_vars(repository_ctx, vc_path) + escaped_paths = escape_string(env["PATH"]) + escaped_include_paths = escape_string(env["INCLUDE"]) + escaped_lib_paths = escape_string(env["LIB"]) + escaped_tmp_dir = escape_string( + get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( + "\\", + "\\\\", + ), + ) + + msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat" + msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace( + "\\", + "/", + ) + msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace( + "\\", + "/", + ) + msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace( + "\\", + "/", + ) + + # nvcc will generate some temporary source files under %{nvcc_tmp_dir} + # The generated files are guaranteed to have unique name, so they can share + # the same tmp directory + escaped_cxx_include_directories = [ + _get_nvcc_tmp_dir_for_windows(repository_ctx), + "C:\\\\botcode\\\\w", + ] + for path in escaped_include_paths.split(";"): + if path: + escaped_cxx_include_directories.append(path) + + return { + "%{msvc_env_tmp}": escaped_tmp_dir, + "%{msvc_env_path}": escaped_paths, + "%{msvc_env_include}": escaped_include_paths, + "%{msvc_env_lib}": escaped_lib_paths, + "%{msvc_cl_path}": msvc_cl_path, + "%{msvc_ml_path}": msvc_ml_path, + "%{msvc_link_path}": msvc_link_path, + "%{msvc_lib_path}": msvc_lib_path, + "%{cxx_builtin_include_directories}": to_list_of_strings( + escaped_cxx_include_directories, + ), + } + # TODO(dzc): Once these functions have been factored out of Bazel's # cc_configure.bzl, load them from @bazel_tools instead. # BEGIN cc_configure common functions. -def find_cc(repository_ctx): +def find_cc(repository_ctx, use_cuda_clang): """Find the C++ compiler.""" + if is_windows(repository_ctx): + return _get_msvc_compiler(repository_ctx) - if _use_cuda_clang(repository_ctx): + if use_cuda_clang: target_cc_name = "clang" cc_path_envvar = _CLANG_CUDA_COMPILER_PATH if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG): @@ -251,9 +365,10 @@ def _cuda_include_path(repository_ctx, cuda_config): Returns: A list of the gcc host compiler include directories. """ - nvcc_path = repository_ctx.path( - "%s/bin/nvcc" % cuda_config.cuda_toolkit_path, - ) + nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( + cuda_config.cuda_toolkit_path, + ".exe" if cuda_config.cpu_value == "Windows" else "", + )) # The expected exit code of this command is non-zero. Bazel remote execution # only caches commands with zero exit code. So force a zero exit code. @@ -314,6 +429,10 @@ def matches_version(environ_version, detected_version): return False return True +_NVCC_VERSION_PREFIX = "Cuda compilation tools, release " + +_DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR" + def compute_capabilities(repository_ctx): """Returns a list of strings representing cuda compute capabilities. @@ -356,11 +475,12 @@ def compute_capabilities(repository_ctx): return capabilities -def lib_name(base_name, version = None, static = False): +def lib_name(base_name, cpu_value, version = None, static = False): """Constructs the platform-specific name of a library. Args: base_name: The name of the library, such as "cudart" + cpu_value: The name of the host operating system. version: The version of the library. static: True the library is static or False if it is a shared object. 
@@ -368,20 +488,29 @@ def lib_name(base_name, version = None, static = False): The platform-specific name of the library. """ version = "" if not version else "." + version - if static: - return "lib%s.a" % base_name - return "lib%s.so%s" % (base_name, version) + if cpu_value in ("Linux", "FreeBSD"): + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + auto_configure_fail("Invalid cpu_value: %s" % cpu_value) -def _lib_path(lib, basedir, version, static): - file_name = lib_name(lib, version, static) +def _lib_path(lib, cpu_value, basedir, version, static): + file_name = lib_name(lib, cpu_value, version, static) return "%s/%s" % (basedir, file_name) def _should_check_soname(version, static): return version and not static -def _check_cuda_lib_params(lib, basedir, version, static = False): +def _check_cuda_lib_params(lib, cpu_value, basedir, version, static = False): return ( - _lib_path(lib, basedir, version, static), + _lib_path(lib, cpu_value, basedir, version, static), _should_check_soname(version, static), ) @@ -401,6 +530,8 @@ def _check_cuda_libs(repository_ctx, script_path, libs): all_paths = [path for path, _ in libs] checked_paths = execute(repository_ctx, [python_bin, "-c", cmd]).stdout.splitlines() + # Filter out empty lines from splitting on '\r\n' on Windows + checked_paths = [path for path in checked_paths if len(path) > 0] if all_paths != checked_paths: auto_configure_fail("Error with installed CUDA libs. Expected '%s'. Actual '%s'." % (all_paths, checked_paths)) @@ -418,62 +549,86 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): Returns: Map of library names to structs of filename and path. 
""" + cpu_value = cuda_config.cpu_value + stub_dir = "" if is_windows(repository_ctx) else "/stubs" + check_cuda_libs_params = { "cuda": _check_cuda_lib_params( "cuda", - cuda_config.config["cuda_library_dir"] + "/stubs", + cpu_value, + cuda_config.config["cuda_library_dir"] + stub_dir, version = None, + static = False, ), "cudart": _check_cuda_lib_params( "cudart", + cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cudart_version, + static = False, ), "cudart_static": _check_cuda_lib_params( "cudart_static", + cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cudart_version, static = True, ), "cublas": _check_cuda_lib_params( "cublas", + cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cublas_version, + static = False, ), "cublasLt": _check_cuda_lib_params( "cublasLt", + cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cublas_version, + static = False, ), "cusolver": _check_cuda_lib_params( "cusolver", + cpu_value, cuda_config.config["cusolver_library_dir"], cuda_config.cusolver_version, + static = False, ), "curand": _check_cuda_lib_params( "curand", + cpu_value, cuda_config.config["curand_library_dir"], cuda_config.curand_version, + static = False, ), "cufft": _check_cuda_lib_params( "cufft", + cpu_value, cuda_config.config["cufft_library_dir"], cuda_config.cufft_version, + static = False, ), "cudnn": _check_cuda_lib_params( "cudnn", + cpu_value, cuda_config.config["cudnn_library_dir"], cuda_config.cudnn_version, + static = False, ), "cupti": _check_cuda_lib_params( "cupti", + cpu_value, cuda_config.config["cupti_library_dir"], cuda_config.cupti_version, + static = False, ), "cusparse": _check_cuda_lib_params( "cusparse", + cpu_value, cuda_config.config["cusparse_library_dir"], cuda_config.cusparse_version, + static = False, ), } @@ -483,6 +638,10 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): paths = {filename: v[0] for (filename, v) in check_cuda_libs_params.items()} return paths +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "" if cpu_value == "Darwin" else "\"-lrt\"," + # TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl, # and nccl_configure.bzl. def find_cuda_config(repository_ctx, cuda_libraries): @@ -509,34 +668,37 @@ def _get_cuda_config(repository_ctx): cudart_version: The CUDA runtime version on the system. cudnn_version: The version of cuDNN on the system. compute_capabilities: A list of the system's CUDA compute capabilities. + cpu_value: The name of the host operating system. """ config = find_cuda_config(repository_ctx, ["cuda", "cudnn"]) + cpu_value = get_cpu_value(repository_ctx) toolkit_path = config["cuda_toolkit_path"] + is_windows = cpu_value == "Windows" cuda_version = config["cuda_version"].split(".") cuda_major = cuda_version[0] cuda_minor = cuda_version[1] - cuda_version = "%s.%s" % (cuda_major, cuda_minor) - cudnn_version = "%s" % config["cudnn_version"] + cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor) + cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"] if int(cuda_major) >= 11: # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability. 
if int(cuda_major) == 11: - cudart_version = "11.0" + cudart_version = "64_110" if is_windows else "11.0" cupti_version = cuda_version else: - cudart_version = "%s" % cuda_major + cudart_version = ("64_%s" if is_windows else "%s") % cuda_major cupti_version = cudart_version - cublas_version = "%s" % config["cublas_version"].split(".")[0] - cusolver_version = "%s" % config["cusolver_version"].split(".")[0] - curand_version = "%s" % config["curand_version"].split(".")[0] - cufft_version = "%s" % config["cufft_version"].split(".")[0] - cusparse_version = "%s" % config["cusparse_version"].split(".")[0] + cublas_version = ("64_%s" if is_windows else "%s") % config["cublas_version"].split(".")[0] + cusolver_version = ("64_%s" if is_windows else "%s") % config["cusolver_version"].split(".")[0] + curand_version = ("64_%s" if is_windows else "%s") % config["curand_version"].split(".")[0] + cufft_version = ("64_%s" if is_windows else "%s") % config["cufft_version"].split(".")[0] + cusparse_version = ("64_%s" if is_windows else "%s") % config["cusparse_version"].split(".")[0] elif (int(cuda_major), int(cuda_minor)) >= (10, 1): # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. # It changed from 'x.y' to just 'x' in CUDA 10.1. - cuda_lib_version = "%s" % cuda_major + cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major cudart_version = cuda_version cupti_version = cuda_version cublas_version = cuda_lib_version @@ -566,6 +728,7 @@ def _get_cuda_config(repository_ctx): cusparse_version = cusparse_version, cudnn_version = cudnn_version, compute_capabilities = compute_capabilities(repository_ctx), + cpu_value = cpu_value, config = config, ) @@ -611,6 +774,8 @@ error_gpu_disabled() """ def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + # Set up BUILD file for cuda/. 
_tpl( repository_ctx, @@ -625,6 +790,23 @@ def _create_dummy_repository(repository_ctx): repository_ctx, "cuda:BUILD", { + "%{cuda_driver_lib}": lib_name("cuda", cpu_value), + "%{cudart_static_lib}": lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": lib_name("cudart", cpu_value), + "%{cublas_lib}": lib_name("cublas", cpu_value), + "%{cublasLt_lib}": lib_name("cublasLt", cpu_value), + "%{cusolver_lib}": lib_name("cusolver", cpu_value), + "%{cudnn_lib}": lib_name("cudnn", cpu_value), + "%{cufft_lib}": lib_name("cufft", cpu_value), + "%{curand_lib}": lib_name("curand", cpu_value), + "%{cupti_lib}": lib_name("cupti", cpu_value), + "%{cusparse_lib}": lib_name("cusparse", cpu_value), + "%{cub_actual}": ":cuda_headers", "%{copy_rules}": """ filegroup(name="cuda-include") filegroup(name="cublas-include") @@ -643,9 +825,20 @@ filegroup(name="cudnn-include") repository_ctx.file("cuda/cuda/include/cublas.h") repository_ctx.file("cuda/cuda/include/cudnn.h") repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") - repository_ctx.file("cuda/cuda/lib/libcuda.so") - repository_ctx.file("cuda/cuda/lib/libcudart_static.a") repository_ctx.file("cuda/cuda/nvml/include/nvml.h") + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublasLt", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusparse", cpu_value)) # Set up cuda_config.h, which is used by # tensorflow/compiler/xla/stream_executor/dso_loader.cc. @@ -709,7 +902,7 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): cmd = \"""%s \""", )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): +def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions = None): """Returns a rule to recursively copy a directory. If exceptions is not None, it must be a list of files or directories in 'src_dir'; these will be excluded from copying. @@ -717,25 +910,39 @@ def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) + post_cmd = "" + if exceptions != None: + outs = [x for x in outs if not any([ + x.startswith(src_dir + "/" + y) + for y in exceptions + ])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + if exceptions != None: + for x in exceptions: + post_cmd += " ; rm -fR " + out_dir + "/" + x return """genrule( name = "%s", outs = [ %s ], - cmd = \"""cp -rLf "%s/." 
"%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) + cmd = \"""cp -rLf "%s/." "%s/" %s\""", +)""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) def _flag_enabled(repository_ctx, flag_name): return get_host_environ(repository_ctx, flag_name) == "1" def _use_cuda_clang(repository_ctx): + # Returns the flag if we need to use clang both for C++ and Cuda. return _flag_enabled(repository_ctx, "TF_CUDA_CLANG") +def _use_nvcc_and_clang(repository_ctx): + # Returns the flag if we need to use clang for C++ and NVCC for Cuda. + return _flag_enabled(repository_ctx, "TF_NVCC_CLANG") + def _tf_sysroot(repository_ctx): return get_host_environ(repository_ctx, _TF_SYSROOT, "") @@ -752,6 +959,22 @@ def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) +def _basename(repository_ctx, path_str): + """Returns the basename of a path of type string. + + This method is different from path.basename in that it also works if + the host platform is different from the execution platform + i.e. linux -> windows. + """ + + num_chars = len(path_str) + is_win = is_windows(repository_ctx) + for i in range(num_chars): + r_i = num_chars - 1 - i + if (is_win and path_str[r_i] == "\\") or path_str[r_i] == "/": + return path_str[r_i + 1:] + return path_str + def _create_local_cuda_repository(repository_ctx): """Creates the repository containing files set up to build with CUDA.""" @@ -760,14 +983,15 @@ def _create_local_cuda_repository(repository_ctx): # can easily lead to a O(n^2) runtime in the number of labels. # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778 tpl_paths = {filename: _tpl_path(repository_ctx, filename) for filename in [ - "cuda:BUILD", "cuda:build_defs.bzl", "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", + "crosstool:windows/msvc_wrapper_for_nvcc.py", "crosstool:BUILD", "crosstool:cc_toolchain_config.bzl", "cuda:cuda_config.h", "cuda:cuda_config.py", ]} + tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD.windows" if is_windows(repository_ctx) else "cuda:BUILD") cuda_config = _get_cuda_config(repository_ctx) @@ -879,7 +1103,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_lib_outs = [] for path in cuda_libs.values(): cuda_lib_srcs.append(path) - cuda_lib_outs.append("cuda/lib/" + path.rpartition("/")[-1]) + cuda_lib_outs.append("cuda/lib/" + _basename(repository_ctx, path)) copy_rules.append(make_copy_files_rule( repository_ctx, name = "cuda-lib", @@ -888,7 +1112,11 @@ def _create_local_cuda_repository(repository_ctx): )) # copy files mentioned in third_party/nccl/build_defs.bzl.tpl - bin_files = ["crt/link.stub", "bin2c", "fatbinary", "nvlink", "nvprune"] + file_ext = ".exe" if is_windows(repository_ctx) else "" + bin_files = ( + ["crt/link.stub"] + + [f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]] + ) copy_rules.append(make_copy_files_rule( repository_ctx, name = "cuda-bin", @@ -896,7 +1124,7 @@ def _create_local_cuda_repository(repository_ctx): outs = ["cuda/bin/" + f for f in bin_files], )) - # Select the headers based on the cuDNN version. + # Select the headers based on the cuDNN version (strip '64_' for Windows). 
cudnn_headers = ["cudnn.h"] if cuda_config.cudnn_version.rsplit("_", 1)[-1] >= "8": cudnn_headers += [ @@ -937,15 +1165,33 @@ def _create_local_cuda_repository(repository_ctx): }, ) + cub_actual = "@cub_archive//:cub" + if int(cuda_config.cuda_version_major) >= 11: + cub_actual = ":cuda_headers" + repository_ctx.template( "cuda/BUILD", tpl_paths["cuda:BUILD"], { + "%{cuda_driver_lib}": _basename(repository_ctx, cuda_libs["cuda"]), + "%{cudart_static_lib}": _basename(repository_ctx, cuda_libs["cudart_static"]), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cudart_lib}": _basename(repository_ctx, cuda_libs["cudart"]), + "%{cublas_lib}": _basename(repository_ctx, cuda_libs["cublas"]), + "%{cublasLt_lib}": _basename(repository_ctx, cuda_libs["cublasLt"]), + "%{cusolver_lib}": _basename(repository_ctx, cuda_libs["cusolver"]), + "%{cudnn_lib}": _basename(repository_ctx, cuda_libs["cudnn"]), + "%{cufft_lib}": _basename(repository_ctx, cuda_libs["cufft"]), + "%{curand_lib}": _basename(repository_ctx, cuda_libs["curand"]), + "%{cupti_lib}": _basename(repository_ctx, cuda_libs["cupti"]), + "%{cusparse_lib}": _basename(repository_ctx, cuda_libs["cusparse"]), + "%{cub_actual}": cub_actual, "%{copy_rules}": "\n".join(copy_rules), }, ) is_cuda_clang = _use_cuda_clang(repository_ctx) + is_nvcc_and_clang = _use_nvcc_and_clang(repository_ctx) tf_sysroot = _tf_sysroot(repository_ctx) should_download_clang = is_cuda_clang and _flag_enabled( @@ -956,7 +1202,7 @@ def _create_local_cuda_repository(repository_ctx): download_clang(repository_ctx, "crosstool/extra_tools") # Set up crosstool/ - cc = find_cc(repository_ctx) + cc = find_cc(repository_ctx, is_cuda_clang) cc_fullpath = cc if not should_download_clang else "crosstool/" + cc host_compiler_includes = get_cxx_inc_directories( @@ -993,7 +1239,7 @@ def _create_local_cuda_repository(repository_ctx): cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" - if is_cuda_clang: + if is_cuda_clang and not is_nvcc_and_clang: cuda_defines["%{host_compiler_path}"] = str(cc) cuda_defines["%{host_compiler_warnings}"] = """ # Some parts of the codebase set -Werror and hit this warning, so @@ -1002,10 +1248,12 @@ def _create_local_cuda_repository(repository_ctx): """ cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(host_compiler_includes) cuda_defines["%{compiler_deps}"] = ":empty" + cuda_defines["%{win_compiler_deps}"] = ":empty" repository_ctx.file( "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "", ) + repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "") else: cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" cuda_defines["%{host_compiler_warnings}"] = "" @@ -1025,22 +1273,40 @@ def _create_local_cuda_repository(repository_ctx): # .d file - given that includes that are prefixed with "../" multiple # time quickly grow longer than the root of the tree, this can lead to # bazel's header check failing. 
- cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\"" + if not is_cuda_clang: + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\"" - nvcc_path = "%s/nvcc" % cuda_config.config["cuda_binary_dir"] + file_ext = ".exe" if is_windows(repository_ctx) else "" + nvcc_path = "%s/nvcc%s" % (cuda_config.config["cuda_binary_dir"], file_ext) cuda_defines["%{compiler_deps}"] = ":crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{win_compiler_deps}"] = ":windows_msvc_wrapper_files" wrapper_defines = { "%{cpu_compiler}": str(cc), "%{cuda_version}": cuda_config.cuda_version, "%{nvcc_path}": nvcc_path, - "%{gcc_host_compiler_path}": str(cc), + "%{host_compiler_path}": str(cc), + "%{use_clang_compiler}": str(is_nvcc_and_clang), + "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx), } repository_ctx.template( "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", tpl_paths["crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc"], wrapper_defines, ) + repository_ctx.file( + "crosstool/windows/msvc_wrapper_for_nvcc.bat", + content = "@echo OFF\n{} -B external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py %*".format( + get_python_bin(repository_ctx), + ), + ) + repository_ctx.template( + "crosstool/windows/msvc_wrapper_for_nvcc.py", + tpl_paths["crosstool:windows/msvc_wrapper_for_nvcc.py"], + wrapper_defines, + ) + + cuda_defines.update(_get_win_cuda_defines(repository_ctx)) verify_build_defines(cuda_defines) @@ -1171,12 +1437,28 @@ def _cuda_autoconf_impl(repository_ctx): repository_ctx.symlink(build_file, "BUILD") +# For @bazel_tools//tools/cpp:windows_cc_configure.bzl +_MSVC_ENVVARS = [ + "BAZEL_VC", + "BAZEL_VC_FULL_VERSION", + "BAZEL_VS", + "BAZEL_WINSDK_FULL_VERSION", + "VS90COMNTOOLS", + "VS100COMNTOOLS", + "VS110COMNTOOLS", + "VS120COMNTOOLS", + "VS140COMNTOOLS", + "VS150COMNTOOLS", + "VS160COMNTOOLS", +] + _ENVIRONS = [ _GCC_HOST_COMPILER_PATH, _GCC_HOST_COMPILER_PREFIX, _CLANG_CUDA_COMPILER_PATH, "TF_NEED_CUDA", "TF_CUDA_CLANG", + "TF_NVCC_CLANG", _TF_DOWNLOAD_CLANG, _CUDA_TOOLKIT_PATH, _CUDNN_INSTALL_PATH, @@ -1188,7 +1470,7 @@ _ENVIRONS = [ "TMP", "TMPDIR", "TF_CUDA_PATHS", -] +] + _MSVC_ENVVARS remote_cuda_configure = repository_rule( implementation = _create_local_cuda_repository, diff --git a/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py b/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py index 78292c7b40237a..b88694af5c014d 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py +++ b/third_party/xla/third_party/tsl/third_party/gpus/find_cuda_config.py @@ -29,6 +29,8 @@ If TF_CUDA_PATHS is not specified, a OS specific default is used: Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'. + Windows: CUDA_PATH environment variable, or + C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\* For backwards compatibility, some libraries also use alternative base directories from other environment variables if they are specified. List of @@ -54,6 +56,7 @@ import io import os import glob +import platform import re import subprocess import sys @@ -70,6 +73,18 @@ class ConfigError(Exception): pass +def _is_linux(): + return platform.system() == "Linux" + + +def _is_windows(): + return platform.system() == "Windows" + + +def _is_macos(): + return platform.system() == "Darwin" + + def _matches_version(actual_version, required_version): """Checks whether some version meets the requirements. 
@@ -119,6 +134,8 @@ def _cartesian_product(first, second): def _get_ld_config_paths(): """Returns all directories from 'ldconfig -p'.""" + if not _is_linux(): + return [] ldconfig_path = which("ldconfig") or "/sbin/ldconfig" output = subprocess.check_output([ldconfig_path, "-p"]) pattern = re.compile(".* => (.*)") @@ -139,6 +156,13 @@ def _get_default_cuda_paths(cuda_version): elif not "." in cuda_version: cuda_version = cuda_version + ".*" + if _is_windows(): + return [ + os.environ.get( + "CUDA_PATH", + "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v%s\\" % + cuda_version) + ] return ["/usr/local/cuda-%s" % cuda_version, "/usr/local/cuda", "/usr", "/usr/local/cudnn"] + _get_ld_config_paths() @@ -188,8 +212,14 @@ def _find_file(base_paths, relative_paths, filepattern): def _find_library(base_paths, library_name, required_version): """Returns first valid path to the requested library.""" - filepattern = ".".join(["lib" + library_name, "so"] + - required_version.split(".")[:1]) + "*" + if _is_windows(): + filepattern = library_name + ".lib" + elif _is_macos(): + filepattern = "%s*.dylib" % (".".join(["lib" + library_name] + + required_version.split(".")[:1])) + else: + filepattern = ".".join(["lib" + library_name, "so"] + + required_version.split(".")[:1]) + "*" return _find_file(base_paths, _library_paths(), filepattern) @@ -238,7 +268,7 @@ def get_nvcc_version(path): return match.group(1) return None - nvcc_name = "nvcc" + nvcc_name = "nvcc.exe" if _is_windows() else "nvcc" nvcc_path, nvcc_version = _find_versioned_file(base_paths, [ "", "bin", @@ -528,6 +558,14 @@ def _get_legacy_path(env_name, default=[]): return _list_from_env(env_name, default) +def _normalize_path(path): + """Returns normalized path, with forward slashes on Windows.""" + path = os.path.realpath(path) + if _is_windows(): + path = path.replace("\\", "/") + return path + + def find_cuda_config(): """Returns a dictionary of CUDA library and header file paths.""" libraries = [argv.lower() for argv in sys.argv[1:]] @@ -596,7 +634,7 @@ def find_cuda_config(): for k, v in result.items(): if k.endswith("_dir") or k.endswith("_path"): - result[k] = os.path.realpath(v) + result[k] = _normalize_path(v) return result diff --git a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl index 0bbbc09832db13..5c1195bada43f8 100644 --- a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl +++ b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl @@ -198,6 +198,8 @@ def _rocm_include_path(repository_ctx, rocm_config, bash_bin): inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/15.0.0/include") inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/16.0.0/include") inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/17.0.0/include") + inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/17/include") + inc_dirs.append(rocm_toolkit_path + "/llvm/lib/clang/18/include") # Support hcc based off clang 10.0.0 (for ROCm 3.3) inc_dirs.append(rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/") @@ -345,14 +347,14 @@ def _find_libs(repository_ctx, rocm_config, hipfft_or_rocfft, miopen_path, rccl_ libs_paths = [ (name, _rocm_lib_paths(repository_ctx, name, path)) for name, path in [ - ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"), + ("amdhip64", rocm_config.rocm_toolkit_path), ("rocblas", rocm_config.rocm_toolkit_path), (hipfft_or_rocfft, rocm_config.rocm_toolkit_path), ("hiprand", 
rocm_config.rocm_toolkit_path), ("MIOpen", miopen_path), ("rccl", rccl_path), ("hipsparse", rocm_config.rocm_toolkit_path), - ("roctracer64", rocm_config.rocm_toolkit_path + "/roctracer"), + ("roctracer64", rocm_config.rocm_toolkit_path), ("rocsolver", rocm_config.rocm_toolkit_path), ] ] @@ -694,7 +696,7 @@ def _create_local_rocm_repository(repository_ctx): rocm_defines["%{unfiltered_compile_flags}"] = to_list_of_strings([ "-DTENSORFLOW_USE_ROCM=1", - "-D__HIP_PLATFORM_HCC__", + "-D__HIP_PLATFORM_AMD__", "-DEIGEN_USE_HIP", ]) @@ -729,7 +731,7 @@ def _create_local_rocm_repository(repository_ctx): "%{hipcc_env}": _hipcc_env(repository_ctx), "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib", "%{rocr_runtime_library}": "hsa-runtime64", - "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib", + "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/lib", "%{hip_runtime_library}": "amdhip64", "%{crosstool_verbose}": _crosstool_verbose(repository_ctx), "%{gcc_host_compiler_path}": str(cc), diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl index 6dd0e178ec09b7..9fca8c020bf276 100644 --- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl +++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl @@ -6,8 +6,8 @@ def repo(): """Imports TFRT.""" # Attention: tools parse and update these lines. - TFRT_COMMIT = "e45cd275068c87cbd1d42d0dc89475d72798a9e8" - TFRT_SHA256 = "dd4a1440fdc8bf142c5ac00bd6227e41999a0912b2f847e932b57307f97138dd" + TFRT_COMMIT = "dbd8da33ab49ed8aa5f08ebe85bacb91341f5d61" + TFRT_SHA256 = "b95b1d17eb2e28ee0f00ae672c7377767a17e7dadde169b335aa481bb07883c7" tf_http_archive( name = "tf_runtime", diff --git a/third_party/xla/third_party/tsl/tools/toolchains/cross_compile/cc/BUILD b/third_party/xla/third_party/tsl/tools/toolchains/cross_compile/cc/BUILD new file mode 100644 index 00000000000000..dc621893ac9675 --- /dev/null +++ b/third_party/xla/third_party/tsl/tools/toolchains/cross_compile/cc/BUILD @@ -0,0 +1,191 @@ +"""Toolchain configs for cross-compiling TensorFlow""" + +load("@bazel_tools//tools/cpp:unix_cc_toolchain_config.bzl", "cc_toolchain_config") + +package(default_visibility = ["//visibility:public"]) + +licenses(["restricted"]) + +cc_toolchain_suite( + name = "cross_compile_toolchain_suite", + toolchains = { + "aarch64": ":linux_aarch64_toolchain", + "k8": ":linux_x86_toolchain", + }, +) + +filegroup( + name = "empty", + visibility = ["//visibility:public"], +) + +cc_toolchain( + name = "linux_x86_toolchain", + all_files = ":empty", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = ":empty", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 1, + toolchain_config = ":linux_x86_toolchain_config", + toolchain_identifier = "linux_x86_toolchain", +) + +cc_toolchain_config( + name = "linux_x86_toolchain_config", + abi_libc_version = "local", + abi_version = "local", + builtin_sysroot = "/dt9", + compile_flags = [ + "--target=x86_64-unknown-linux-gnu", + "-fstack-protector", + "-Wall", + "-Wthread-safety", + "-Wself-assign", + "-Wunused-but-set-parameter", + "-Wno-free-nonheap-object", + "-fcolor-diagnostics", + "-fno-omit-frame-pointer", + "-mavx", + ], + compiler = "clang", + coverage_compile_flags = ["--coverage"], + coverage_link_flags = ["--coverage"], + cpu = "k8", + cxx_builtin_include_directories = [ + "/dt9/", + "/usr/lib/llvm-17/include/", + 
"/usr/lib/llvm-17/lib/clang/17/include", + ], + dbg_compile_flags = ["-g"], + host_system_name = "linux", + link_flags = [ + "--target=x86_64-unknown-linux-gnu", + "-fuse-ld=lld", + "--ld-path=/usr/lib/llvm-17/bin/ld.lld", + "-Wl,--undefined-version", + ], + link_libs = [ + "-lstdc++", + "-lm", + ], + opt_compile_flags = [ + "-g0", + "-O2", + "-D_FORTIFY_SOURCE=1", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + opt_link_flags = ["-Wl,--gc-sections"], + supports_start_end_lib = True, + target_libc = "", + target_system_name = "x86_64-unknown-linux-gnu", + tool_paths = { + "gcc": "/usr/lib/llvm-17/bin/clang", + "ld": "/usr/lib/llvm-17/bin/ld.lld", + "ar": "/usr/lib/llvm-17/bin/llvm-ar", + "cpp": "/usr/lib/llvm-17/bin/clang++", + "llvm-cov": "/usr/lib/llvm-17/bin/llvm-cov", + "nm": "/usr/lib/llvm-17/bin/llvm-nm", + "objdump": "/usr/lib/llvm-17/bin/llvm-objdump", + "strip": "/usr/lib/llvm-17/bin/llvm-strip", + }, + toolchain_identifier = "linux_x86_toolchain", + unfiltered_compile_flags = [ + "-no-canonical-prefixes", + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + "-Wno-unused-command-line-argument", + "-Wno-gnu-offsetof-extensions", + ], +) + +cc_toolchain( + name = "linux_aarch64_toolchain", + all_files = ":empty", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = ":empty", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 1, + toolchain_config = ":linux_aarch64_toolchain_config", + toolchain_identifier = "linux_aarch64_toolchain", +) + +cc_toolchain_config( + name = "linux_aarch64_toolchain_config", + abi_libc_version = "local", + abi_version = "local", + builtin_sysroot = "/dt10/", + compile_flags = [ + "--target=aarch64-unknown-linux-gnu", + "-fstack-protector", + "-Wall", + "-Wthread-safety", + "-Wself-assign", + "-Wunused-but-set-parameter", + "-Wno-free-nonheap-object", + "-fcolor-diagnostics", + "-fno-omit-frame-pointer", + "-mtune=generic", + "-march=armv8-a", + ], + compiler = "clang", + coverage_compile_flags = ["--coverage"], + coverage_link_flags = ["--coverage"], + cpu = "aarch64", + cxx_builtin_include_directories = [ + "/dt10/", + "/usr/lib/llvm-17/include/", + "/usr/lib/llvm-17/lib/clang/17/include", + ], + dbg_compile_flags = ["-g"], + host_system_name = "linux", + link_flags = [ + "--target=aarch64-unknown-linux-gnu", + "-fuse-ld=lld", + "--ld-path=/usr/lib/llvm-17/bin/ld.lld", + "-Wl,--undefined-version", + ], + link_libs = [ + "-lstdc++", + "-lm", + ], + opt_compile_flags = [ + "-g0", + "-O2", + "-D_FORTIFY_SOURCE=1", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + opt_link_flags = ["-Wl,--gc-sections"], + supports_start_end_lib = True, + target_libc = "", + target_system_name = "aarch64-unknown-linux-gnu", + tool_paths = { + "gcc": "/usr/lib/llvm-17/bin/clang", + "ld": "/usr/lib/llvm-17/bin/ld.lld", + "ar": "/usr/lib/llvm-17/bin/llvm-ar", + "cpp": "/usr/lib/llvm-17/bin/clang++", + "llvm-cov": "/usr/lib/llvm-17/bin/llvm-cov", + "nm": "/usr/lib/llvm-17/bin/llvm-nm", + "objdump": "/usr/lib/llvm-17/bin/llvm-objdump", + "strip": "/usr/lib/llvm-17/bin/llvm-strip", + }, + toolchain_identifier = "linux_aarch64_toolchain", + unfiltered_compile_flags = [ + "-no-canonical-prefixes", + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + "-Wno-unused-command-line-argument", + "-Wno-gnu-offsetof-extensions", + ], +) diff --git 
a/third_party/xla/third_party/tsl/tools/toolchains/cross_compile/config/BUILD b/third_party/xla/third_party/tsl/tools/toolchains/cross_compile/config/BUILD new file mode 100644 index 00000000000000..b6a504ba1449d6 --- /dev/null +++ b/third_party/xla/third_party/tsl/tools/toolchains/cross_compile/config/BUILD @@ -0,0 +1,23 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["restricted"]) + +platform( + name = "linux_x86_64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], + exec_properties = { + "container-image": "docker://gcr.io/tensorflow-testing/ml-devinfra-linux-aarch64-cross-compile@sha256:11c5ac3b9b4e01cfa82b39b90826a9bfc5b806ccc92cd3d272e6bf861de43be1", + "OSFamily": "Linux", + }, +) + +platform( + name = "linux_aarch64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:aarch64", + ], +) diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl index 4554463cb90675..4b07fb5c18670d 100644 --- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl +++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/configs.bzl @@ -200,6 +200,28 @@ def initialize_rbe_configs(): python_install_path = "/usr/local", ) + tensorflow_rbe_config( + name = "ubuntu20.04-clang_manylinux2014-cuda12.3-cudnn8.9", + compiler = "/usr/lib/llvm-17/bin/clang", + cuda_version = "12.3", + cudnn_version = "8.9", + os = "ubuntu20.04-manylinux2014-multipython", + python_versions = ["3.9", "3.10", "3.11", "3.12"], + sysroot = "/dt9", + python_install_path = "/usr/local", + ) + + tensorflow_rbe_config( + name = "ubuntu20.04-gcc9_manylinux2014-cuda12.3-cudnn8.9", + compiler = "/dt9/usr/bin/gcc", + compiler_prefix = "/usr/bin", + cuda_version = "12.3", + cudnn_version = "8.9", + os = "ubuntu20.04-manylinux2014-multipython", + python_versions = ["3.9", "3.10", "3.11", "3.12"], + python_install_path = "/usr/local", + ) + tensorflow_rbe_win_config( name = "windows_py37", python_bin_path = "C:/Python37/python.exe", diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl index bfb4634e810328..cd346c2816def1 100644 --- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl +++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/containers.bzl @@ -5,8 +5,9 @@ container_digests = { # TF now uses only this container "cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython": "sha256:48612bd85709cd014711d0b0f87e0806f3567d06d2e81c6e860516b87498b821", # JAX manylinux2014 configs. 
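Editor's note — returning to the find_cuda_config.py changes earlier in this patch: the probing script now understands Windows and macOS. `ldconfig -p` is only consulted on Linux, the default CUDA root on Windows falls back to `CUDA_PATH` or the standard "NVIDIA GPU Computing Toolkit" directory, library file patterns switch between `.so*`, `.dylib`, and `.lib`, `nvcc` becomes `nvcc.exe`, and discovered paths are normalized to forward slashes so Starlark can consume them. The sketch below re-implements that per-OS selection logic purely for illustration; the helper names (`LibraryPattern`, `NormalizePath`) are invented here and are not part of the change.

```cpp
// Illustrative re-implementation of the per-OS logic added to
// find_cuda_config.py. Hypothetical helper names; not part of the patch.
#include <algorithm>
#include <iostream>
#include <string>

enum class Os { kLinux, kMacOs, kWindows };

// Mirrors _find_library(): pick the filename pattern used to glob for a
// CUDA library, given its base name and required major version.
std::string LibraryPattern(Os os, const std::string& name,
                           const std::string& major_version) {
  switch (os) {
    case Os::kWindows:
      return name + ".lib";  // e.g. cudart.lib
    case Os::kMacOs:
      return "lib" + name + "." + major_version + "*.dylib";
    case Os::kLinux:
    default:
      return "lib" + name + ".so." + major_version + "*";  // libcudart.so.12*
  }
}

// Mirrors _normalize_path(): keep forward slashes even on Windows so the
// result can be written into Bazel repository files without re-escaping.
std::string NormalizePath(Os os, std::string path) {
  if (os == Os::kWindows) std::replace(path.begin(), path.end(), '\\', '/');
  return path;
}

int main() {
  std::cout << LibraryPattern(Os::kWindows, "cudart", "12") << "\n";
  std::cout << LibraryPattern(Os::kLinux, "cudart", "12") << "\n";
  std::cout << NormalizePath(
                   Os::kWindows,
                   "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.3")
            << "\n";
}
```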
- "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:ab39410baf2fc1d31d50540acec7640d7f4814fa694e2421b696b6f0a058d645", - "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:b699d6ae235ac601dc3e62391ac7c4606cb10331f8141983858c1580f5e74ddb", + "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:b112c0c77d4172fc025420938f13ea83f3ad480c01778e743a201e5e3f4710e1", + "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:9fefda035b4a12b24cd5bae56c7dbb9527a5fd06a41ced0a22ac86fe5ed26428", + "cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:afe68c3448734cb07b16005fd9ed47d19533eb8bf5acd92863735ce24766b93b", # ROCM, probably not all of them still in use "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb", "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204", @@ -98,6 +99,13 @@ containers = { "digest": container_digests["cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython"], }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython. + "cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython": { + "registry": "gcr.io", + "repository": "tensorflow-testing/nosla-cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython", + "digest": container_digests["cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython"], + }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython. "rocm-ubuntu18.04-manylinux2010-multipython": { "registry": "gcr.io", diff --git a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel index 57597e207686ff..6ccfd7a019a3ce 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel +++ b/third_party/xla/third_party/tsl/tsl/cuda/BUILD.bazel @@ -2,10 +2,6 @@ # Stubs for dynamically loading CUDA. 
load("//tsl/cuda:stub.bzl", "cuda_stub") -load( - "//tsl/platform:build_config.bzl", - "tsl_cc_test", -) load( "//tsl/platform:rules_cc.bzl", "cc_library", @@ -44,7 +40,8 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@local_config_cuda//cuda:cuda_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -67,7 +64,8 @@ cc_library( deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -90,7 +88,8 @@ cc_library( deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -122,7 +121,8 @@ cc_library( "//tsl:is_cuda_enabled_and_oss": [ ":cuda", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:load_library", + "//tsl/platform:logging", "@com_google_absl//absl/container:flat_hash_set", "@local_config_cuda//cuda:cuda_headers", ], @@ -151,7 +151,8 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@local_config_cuda//cuda:cudnn_header", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -187,7 +188,8 @@ cc_library( deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -213,7 +215,8 @@ cc_library( "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cupti_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -237,7 +240,8 @@ cc_library( deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -261,7 +265,8 @@ cc_library( deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) @@ -287,6 +292,7 @@ cc_library( "@local_config_cuda//cuda:cuda_headers", "@local_config_nccl//:nccl_headers", "//tsl/platform:dso_loader", - "//tsl/platform:env", + "//tsl/platform:logging", + "//tsl/platform:load_library", ]), ) diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols index 7f93cfcb3ad49f..db6fa52731f784 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols +++ b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt.symbols @@ -38,62 +38,119 @@ cublasLtDDDMatmulAlgoGetHeuristic cublasLtDDDMatmulAlgoGetIds cublasLtDDDMatmulAlgoInit cublasLtDestroy +cublasLtE4m3E4m3Fp32Bf16Bf16Matmul cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoCapGetAttribute cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoCheck +cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoGetHeuristic +cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoGetIds cublasLtE4m3E4m3Fp32Bf16Bf16MatmulAlgoInit +cublasLtE4m3E4m3Fp32Bf16E4m3Matmul cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoCapGetAttribute cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoCheck +cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoGetHeuristic +cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoGetIds cublasLtE4m3E4m3Fp32Bf16E4m3MatmulAlgoInit +cublasLtE4m3E4m3Fp32Fp16E4m3Matmul 
cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoCapGetAttribute cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoCheck +cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoGetHeuristic +cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoGetIds cublasLtE4m3E4m3Fp32Fp16E4m3MatmulAlgoInit +cublasLtE4m3E4m3Fp32Fp16Fp16Matmul cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoCapGetAttribute cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoCheck +cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoGetHeuristic +cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoGetIds cublasLtE4m3E4m3Fp32Fp16Fp16MatmulAlgoInit +cublasLtE4m3E4m3Fp32Fp32Fp32Matmul cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoCapGetAttribute cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoCheck +cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoGetHeuristic +cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoGetIds cublasLtE4m3E4m3Fp32Fp32Fp32MatmulAlgoInit +cublasLtE4m3E5m2Fp32Bf16Bf16Matmul cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Bf16Bf16MatmulAlgoInit +cublasLtE4m3E5m2Fp32Bf16E4m3Matmul cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Bf16E4m3MatmulAlgoInit +cublasLtE4m3E5m2Fp32Bf16E5m2Matmul cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Bf16E5m2MatmulAlgoInit +cublasLtE4m3E5m2Fp32Fp16E4m3Matmul cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Fp16E4m3MatmulAlgoInit +cublasLtE4m3E5m2Fp32Fp16E5m2Matmul cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Fp16E5m2MatmulAlgoInit +cublasLtE4m3E5m2Fp32Fp16Fp16Matmul cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Fp16Fp16MatmulAlgoInit +cublasLtE4m3E5m2Fp32Fp32Fp32Matmul cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoCapGetAttribute cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoCheck +cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoGetHeuristic +cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoGetIds cublasLtE4m3E5m2Fp32Fp32Fp32MatmulAlgoInit +cublasLtE5m2E4m3Fp32Bf16Bf16Matmul cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Bf16Bf16MatmulAlgoInit +cublasLtE5m2E4m3Fp32Bf16E4m3Matmul cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Bf16E4m3MatmulAlgoInit +cublasLtE5m2E4m3Fp32Bf16E5m2Matmul cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Bf16E5m2MatmulAlgoInit 
+cublasLtE5m2E4m3Fp32Fp16E4m3Matmul cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Fp16E4m3MatmulAlgoInit +cublasLtE5m2E4m3Fp32Fp16E5m2Matmul cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Fp16E5m2MatmulAlgoInit +cublasLtE5m2E4m3Fp32Fp16Fp16Matmul cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Fp16Fp16MatmulAlgoInit +cublasLtE5m2E4m3Fp32Fp32Fp32Matmul cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoCapGetAttribute cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoCheck +cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoGetHeuristic +cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoGetIds cublasLtE5m2E4m3Fp32Fp32Fp32MatmulAlgoInit cublasLtGetCudartVersion cublasLtGetProperty diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc index df4e73bebc126c..d078aa2f2c55ee 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cublasLt_stub.cc @@ -15,7 +15,8 @@ limitations under the License. #include "third_party/gpus/cuda/include/cublasLt.h" #include "third_party/gpus/cuda/include/cuda.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the cuBLASLt API by forwarding to cuBLASLt loaded from the DSO. @@ -33,8 +34,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc index 814d64d75d8d61..fe3cec911ca186 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cublas_stub.cc @@ -24,7 +24,8 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "third_party/gpus/cuda/include/cuda.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the cuBLAS API by forwarding to cuBLAS loaded from the DSO. // Note that it does not implement the v1 interface. 
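Editor's note — the stub refactor running through these files replaces `tsl::Env::Default()->GetSymbolFromLibrary(...)` with the lighter `tsl::internal::GetSymbolFromLibrary(...)`, and the BUILD rules swap the `//tsl/platform:env` dependency for `//tsl/platform:load_library` plus `//tsl/platform:logging`. The lazy-resolution pattern itself is unchanged: open the real library once, then resolve each symbol on demand. The POSIX sketch below shows that pattern in isolation; it is a standalone illustration, not the TSL implementation, and the cast to the CUDA function signature is simplified.

```cpp
// Minimal sketch of the lazy DSO-forwarding pattern used by the tsl/cuda
// stubs, using raw POSIX dlopen/dlsym instead of the TSL wrappers.
#include <dlfcn.h>
#include <cstdio>

namespace {

// Open the real library once and cache the handle (nullptr if unavailable).
void* GetDsoHandle() {
  static void* handle = dlopen("libcudart.so.12", RTLD_NOW | RTLD_LOCAL);
  return handle;
}

// Resolve a symbol from the cached handle; nullptr means "not available".
void* LoadSymbol(const char* symbol_name) {
  void* handle = GetDsoHandle();
  return handle ? dlsym(handle, symbol_name) : nullptr;
}

}  // namespace

int main() {
  using GetVersionFn = int (*)(int*);
  auto fn = reinterpret_cast<GetVersionFn>(LoadSymbol("cudaRuntimeGetVersion"));
  if (!fn) {
    std::printf("CUDA runtime not found; a stub would report an error here\n");
    return 0;
  }
  int version = 0;
  fn(&version);
  std::printf("cudaRuntimeGetVersion -> %d\n", version);
}
```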
@@ -43,8 +44,7 @@ void *GetDsoHandle() { void *LoadSymbol(const char *symbol_name) { void *symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols index 558d11cafdbc99..97e1d00ebd57ae 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols +++ b/third_party/xla/third_party/tsl/tsl/cuda/cuda.symbols @@ -10,6 +10,10 @@ cuArrayGetDescriptor_v2 cuArrayGetMemoryRequirements cuArrayGetPlane cuArrayGetSparseProperties +cuCoredumpGetAttribute +cuCoredumpGetAttributeGlobal +cuCoredumpSetAttribute +cuCoredumpSetAttributeGlobal cuCtxAttach cuCtxCreate cuCtxCreate_v2 @@ -36,6 +40,7 @@ cuCtxPushCurrent_v2 cuCtxResetPersistingL2Cache cuCtxSetCacheConfig cuCtxSetCurrent +cuCtxSetFlags cuCtxSetLimit cuCtxSetSharedMemConfig cuCtxSynchronize @@ -99,6 +104,7 @@ cuExternalMemoryGetMappedMipmappedArray cuFlushGPUDirectRDMAWrites cuFuncGetAttribute cuFuncGetModule +cuFuncGetName cuFuncSetAttribute cuFuncSetBlockShape cuFuncSetCacheConfig @@ -128,6 +134,7 @@ cuGetProcAddress_v2 cuGraphAddBatchMemOpNode cuGraphAddChildGraphNode cuGraphAddDependencies +cuGraphAddDependencies_v2 cuGraphAddEmptyNode cuGraphAddEventRecordNode cuGraphAddEventWaitNode @@ -140,10 +147,13 @@ cuGraphAddMemAllocNode cuGraphAddMemFreeNode cuGraphAddMemcpyNode cuGraphAddMemsetNode +cuGraphAddNode +cuGraphAddNode_v2 cuGraphBatchMemOpNodeGetParams cuGraphBatchMemOpNodeSetParams cuGraphChildGraphNodeGetGraph cuGraphClone +cuGraphConditionalHandleCreate cuGraphCreate cuGraphDebugDotPrint cuGraphDestroy @@ -165,6 +175,7 @@ cuGraphExecKernelNodeSetParams cuGraphExecKernelNodeSetParams_v2 cuGraphExecMemcpyNodeSetParams cuGraphExecMemsetNodeSetParams +cuGraphExecNodeSetParams cuGraphExecUpdate cuGraphExecUpdate_v2 cuGraphExternalSemaphoresSignalNodeGetParams @@ -172,6 +183,7 @@ cuGraphExternalSemaphoresSignalNodeSetParams cuGraphExternalSemaphoresWaitNodeGetParams cuGraphExternalSemaphoresWaitNodeSetParams cuGraphGetEdges +cuGraphGetEdges_v2 cuGraphGetNodes cuGraphGetRootNodes cuGraphHostNodeGetParams @@ -198,12 +210,16 @@ cuGraphMemsetNodeGetParams cuGraphMemsetNodeSetParams cuGraphNodeFindInClone cuGraphNodeGetDependencies +cuGraphNodeGetDependencies_v2 cuGraphNodeGetDependentNodes +cuGraphNodeGetDependentNodes_v2 cuGraphNodeGetEnabled cuGraphNodeGetType cuGraphNodeSetEnabled +cuGraphNodeSetParams cuGraphReleaseUserObject cuGraphRemoveDependencies +cuGraphRemoveDependencies_v2 cuGraphRetainUserObject cuGraphUpload cuGraphUpload_ptsz @@ -235,6 +251,7 @@ cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 cuKernelGetAttribute cuKernelGetFunction +cuKernelGetName cuKernelSetAttribute cuKernelSetCacheConfig cuLaunch @@ -268,6 +285,7 @@ cuLinkDestroy cuMemAddressFree cuMemAddressReserve cuMemAdvise +cuMemAdvise_v2 cuMemAlloc cuMemAllocAsync cuMemAllocAsync_ptsz @@ -320,6 +338,8 @@ cuMemPoolSetAttribute cuMemPoolTrimTo cuMemPrefetchAsync cuMemPrefetchAsync_ptsz +cuMemPrefetchAsync_v2 +cuMemPrefetchAsync_v2_ptsz cuMemRangeGetAttribute cuMemRangeGetAttributes cuMemRelease @@ -438,6 +458,12 @@ cuModuleLoadData cuModuleLoadDataEx cuModuleLoadFatBinary cuModuleUnload +cuMulticastAddDevice +cuMulticastBindAddr +cuMulticastBindMem +cuMulticastCreate +cuMulticastGetGranularity +cuMulticastUnbind cuOccupancyAvailableDynamicSMemPerBlock 
cuOccupancyMaxActiveBlocksPerMultiprocessor cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags @@ -467,6 +493,8 @@ cuStreamBatchMemOp_ptsz cuStreamBatchMemOp_v2 cuStreamBatchMemOp_v2_ptsz cuStreamBeginCapture +cuStreamBeginCaptureToGraph +cuStreamBeginCaptureToGraph_ptsz cuStreamBeginCapture_ptsz cuStreamBeginCapture_v2 cuStreamBeginCapture_v2_ptsz @@ -484,6 +512,8 @@ cuStreamGetCaptureInfo cuStreamGetCaptureInfo_ptsz cuStreamGetCaptureInfo_v2 cuStreamGetCaptureInfo_v2_ptsz +cuStreamGetCaptureInfo_v3 +cuStreamGetCaptureInfo_v3_ptsz cuStreamGetCtx cuStreamGetCtx_ptsz cuStreamGetFlags @@ -502,6 +532,8 @@ cuStreamSynchronize cuStreamSynchronize_ptsz cuStreamUpdateCaptureDependencies cuStreamUpdateCaptureDependencies_ptsz +cuStreamUpdateCaptureDependencies_v2 +cuStreamUpdateCaptureDependencies_v2_ptsz cuStreamWaitEvent cuStreamWaitEvent_ptsz cuStreamWaitValue32 @@ -574,10 +606,30 @@ cuVDPAUGetDevice cuWaitExternalSemaphoresAsync cuWaitExternalSemaphoresAsync_ptsz cudbgApiAttach +cudbgApiClientPid +cudbgApiClientRevision cudbgApiDetach cudbgApiInit +cudbgAttachHandlerAvailable +cudbgDebuggerCapabilities +cudbgDebuggerInitialized +cudbgDetachSuspendedDevicesMask +cudbgEnableIntegratedMemcheck +cudbgEnableLaunchBlocking +cudbgEnablePreemptionDebugging cudbgGetAPI cudbgGetAPIVersion +cudbgInjectionPath +cudbgIpcFlag cudbgMain cudbgReportDriverApiError +cudbgReportDriverApiErrorFlags cudbgReportDriverInternalError +cudbgReportedDriverApiErrorCode +cudbgReportedDriverApiErrorFuncNameAddr +cudbgReportedDriverApiErrorFuncNameSize +cudbgReportedDriverInternalErrorCode +cudbgResumeForAttachDetach +cudbgRpcEnabled +cudbgSessionId +cudbgUseExternalDebugger diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc index a199d4cc700442..298d493db97d15 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cuda_stub.cc @@ -14,7 +14,8 @@ limitations under the License. ==============================================================================*/ #include "third_party/gpus/cuda/include/cuda.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the CUDA driver API by forwarding to CUDA loaded from the DSO. 
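Editor's note — the `.symbols` lists above grow to cover the CUDA 12.3 driver surface: the `cuCoredump*` attributes, the `_v2` graph-dependency entry points, `cuMulticast*`, stream capture v3, and the `cudbg` debugger hooks. Assuming, as the BUILD file suggests, that each list enumerates the symbols the corresponding stub re-exports, a quick diagnostic is to `dlsym` a few of the newly added names against the locally installed driver, as sketched below with a hard-coded sample taken from the list.

```cpp
// Check which of a few newly listed driver symbols the local libcuda
// actually exports. Purely a diagnostic sketch; the stub libraries are
// generated from the .symbols files, not from code like this.
#include <dlfcn.h>
#include <cstdio>

int main() {
  void* handle = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
  if (!handle) {
    std::printf("libcuda.so.1 not found: %s\n", dlerror());
    return 1;
  }
  const char* kNewSymbols[] = {
      "cuCoredumpGetAttribute", "cuGraphAddNode_v2",
      "cuMulticastCreate",      "cuStreamBeginCaptureToGraph",
      "cuStreamGetCaptureInfo_v3",
  };
  for (const char* name : kNewSymbols) {
    std::printf("%-30s %s\n", name,
                dlsym(handle, name) ? "present" : "missing");
  }
  dlclose(handle);
  return 0;
}
```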
@@ -36,8 +37,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols index 69b990cb3879b5..443b8057e44f0e 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols +++ b/third_party/xla/third_party/tsl/tsl/cuda/cudart.symbols @@ -80,6 +80,7 @@ cudaFreeAsync_ptsz cudaFreeHost cudaFreeMipmappedArray cudaFuncGetAttributes +cudaFuncGetName cudaFuncSetAttribute cudaFuncSetCacheConfig cudaFuncSetSharedMemConfig @@ -115,6 +116,7 @@ cudaGetTextureObjectResourceViewDesc cudaGetTextureObjectTextureDesc cudaGraphAddChildGraphNode cudaGraphAddDependencies +cudaGraphAddDependencies_v2 cudaGraphAddEmptyNode cudaGraphAddEventRecordNode cudaGraphAddEventWaitNode @@ -130,8 +132,10 @@ cudaGraphAddMemcpyNodeFromSymbol cudaGraphAddMemcpyNodeToSymbol cudaGraphAddMemsetNode cudaGraphAddNode +cudaGraphAddNode_v2 cudaGraphChildGraphNodeGetGraph cudaGraphClone +cudaGraphConditionalHandleCreate cudaGraphCreate cudaGraphDebugDotPrint cudaGraphDestroy @@ -161,6 +165,7 @@ cudaGraphExternalSemaphoresSignalNodeSetParams cudaGraphExternalSemaphoresWaitNodeGetParams cudaGraphExternalSemaphoresWaitNodeSetParams cudaGraphGetEdges +cudaGraphGetEdges_v2 cudaGraphGetNodes cudaGraphGetRootNodes cudaGraphHostNodeGetParams @@ -187,13 +192,16 @@ cudaGraphMemsetNodeGetParams cudaGraphMemsetNodeSetParams cudaGraphNodeFindInClone cudaGraphNodeGetDependencies +cudaGraphNodeGetDependencies_v2 cudaGraphNodeGetDependentNodes +cudaGraphNodeGetDependentNodes_v2 cudaGraphNodeGetEnabled cudaGraphNodeGetType cudaGraphNodeSetEnabled cudaGraphNodeSetParams cudaGraphReleaseUserObject cudaGraphRemoveDependencies +cudaGraphRemoveDependencies_v2 cudaGraphRetainUserObject cudaGraphUpload cudaGraphUpload_ptsz @@ -348,6 +356,8 @@ cudaStreamAddCallback_ptsz cudaStreamAttachMemAsync cudaStreamAttachMemAsync_ptsz cudaStreamBeginCapture +cudaStreamBeginCaptureToGraph +cudaStreamBeginCaptureToGraph_ptsz cudaStreamBeginCapture_ptsz cudaStreamCopyAttributes cudaStreamCopyAttributes_ptsz @@ -363,6 +373,8 @@ cudaStreamGetCaptureInfo cudaStreamGetCaptureInfo_ptsz cudaStreamGetCaptureInfo_v2 cudaStreamGetCaptureInfo_v2_ptsz +cudaStreamGetCaptureInfo_v3 +cudaStreamGetCaptureInfo_v3_ptsz cudaStreamGetFlags cudaStreamGetFlags_ptsz cudaStreamGetId @@ -379,6 +391,8 @@ cudaStreamSynchronize cudaStreamSynchronize_ptsz cudaStreamUpdateCaptureDependencies cudaStreamUpdateCaptureDependencies_ptsz +cudaStreamUpdateCaptureDependencies_v2 +cudaStreamUpdateCaptureDependencies_v2_ptsz cudaStreamWaitEvent cudaStreamWaitEvent_ptsz cudaThreadExchangeStreamCaptureMode diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc index a3797b5c751cd8..5ec2fabd84a712 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cudart_stub.cc @@ -21,7 +21,8 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" namespace { void *GetDsoHandle() { @@ -39,8 +40,8 @@ void *GetDsoHandle() { void *LoadSymbol(const char *symbol_name) { void *symbol = nullptr; - auto env = tsl::Env::Default(); - env->GetSymbolFromLibrary(GetDsoHandle(), symbol_name, &symbol).IgnoreError(); + tsl::internal::GetSymbolFromLibrary(GetDsoHandle(), symbol_name, &symbol) + .IgnoreError(); return symbol; } diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc index f3cab179eb0b71..1c85b1ea684a28 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cudnn_stub.cc @@ -16,7 +16,8 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "third_party/gpus/cudnn/cudnn.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the cuDNN API by forwarding to cuDNN loaded from the DSO. @@ -38,8 +39,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols b/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols index 605815200bd90e..0f18127df42af5 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols +++ b/third_party/xla/third_party/tsl/tsl/cuda/cufft.symbols @@ -1,7 +1,6 @@ cufftCreate cufftDebug cufftDestroy -cufftEnterCS cufftEstimate1d cufftEstimate2d cufftEstimate3d @@ -20,11 +19,9 @@ cufftGetSize3d cufftGetSizeMany cufftGetSizeMany64 cufftGetVersion -cufftLeaveCS cufftMakePlan1d cufftMakePlan2d cufftMakePlan3d -cufftMakePlanGuru64 cufftMakePlanMany cufftMakePlanMany64 cufftPlan1d diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc index 8f5c1b0d687337..275560027af19b 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cufft_stub.cc @@ -15,7 +15,8 @@ limitations under the License. #include "third_party/gpus/cuda/include/cufft.h" #include "third_party/gpus/cuda/include/cufftXt.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the cuFFT API by forwarding to cuFFT loaded from the DSO. @@ -37,8 +38,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc index 9e632010d83a7a..aab8217aa3ebe5 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cupti_stub.cc @@ -16,7 +16,8 @@ limitations under the License. 
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" #include "third_party/gpus/cuda/include/cuda.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the CUPTI API by forwarding to CUPTI loaded from the DSO. @@ -38,8 +39,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc index d11601b3bd4217..418ce47311d718 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cusolver_stub.cc @@ -16,7 +16,8 @@ limitations under the License. #include "third_party/gpus/cuda/include/cusolverDn.h" #include "third_party/gpus/cuda/include/cusolverSp.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the cusolver API by forwarding to cusolver loaded from the DSO. @@ -38,8 +39,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc index 16141e51e2613b..8b545cd0c1c1d8 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/cusparse_stub.cc @@ -15,7 +15,8 @@ limitations under the License. #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cusparse.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the cusparse API by forwarding to cusparse loaded from the DSO. @@ -37,8 +38,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc b/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc index 0ebae2f3c2b2eb..462ab127ee446b 100644 --- a/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/cuda/nccl_stub.cc @@ -18,7 +18,8 @@ limitations under the License. #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/nccl/nccl.h" #include "tsl/platform/dso_loader.h" -#include "tsl/platform/env.h" +#include "tsl/platform/load_library.h" +#include "tsl/platform/logging.h" // Implements the nccl API by forwarding to nccl loaded from a DSO. 
@@ -40,8 +41,7 @@ void* GetDsoHandle() { void* LoadSymbol(const char* symbol_name) { void* symbol = nullptr; if (auto handle = GetDsoHandle()) { - tsl::Env::Default() - ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + tsl::internal::GetSymbolFromLibrary(handle, symbol_name, &symbol) .IgnoreError(); } return symbol; diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc index 0b916e65aaa208..9d92bdccceb2c9 100644 --- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc +++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc @@ -62,6 +62,7 @@ constexpr int kServiceToClientTimeoutMs = 10 * 1000; // 10 seconds constexpr size_t kOngoingBarriersSoftLimit = 20; constexpr char kHealthCheckThread[] = "CoordinationServiceHealthCheck"; constexpr int kPendingTaskLogLimit = 20; +constexpr int kPendingStragglerLogLimit = 3; std::string GetTaskName(absl::string_view job_name, int task_id) { return strings::StrCat("/job:", job_name, "/replica:", 0, "/task:", task_id); @@ -104,6 +105,9 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface { void SetDeviceAggregationFunction( std::function post_aggregate_device_fn) override; + + void LogConnectStatusLocked() const TF_EXCLUSIVE_LOCKS_REQUIRED(state_mu_); + Status RegisterTask(const CoordinatedTask& task, uint64_t incarnation) override; void WaitForAllTasks(const CoordinatedTask& task, const DeviceInfo& devices, @@ -519,6 +523,26 @@ void CoordinationServiceStandaloneImpl::Stop(bool shut_staleness_thread) { } } +// Helper to log progress to having waited for all tasks. +void CoordinationServiceStandaloneImpl::LogConnectStatusLocked() const { + const int num_tasks = cluster_state_.size(); + int pending_tasks = 0; + std::vector task_names; + for (const auto& [task_name, task_state] : cluster_state_) { + if (task_state->GetState() != CoordinatedTaskState::TASKSTATE_CONNECTED) { + pending_tasks++; + if (task_names.size() < kPendingStragglerLogLimit) { + task_names.push_back(task_name); + } + } + } + LOG(INFO) << "Waiting for " << pending_tasks << "/" << num_tasks + << " tasks to connect."; + if (!task_names.empty()) { + LOG(INFO) << "Example stragglers:\n" << absl::StrJoin(task_names, "\n"); + } +} + Status CoordinationServiceStandaloneImpl::RegisterTask( const CoordinatedTask& task, uint64_t incarnation) { const std::string& task_name = GetTaskName(task); @@ -553,6 +577,7 @@ Status CoordinationServiceStandaloneImpl::RegisterTask( LOG(INFO) << task_name << " has connected to coordination service. 
Incarnation: " << incarnation; + LogConnectStatusLocked(); return OkStatus(); } else if (task_state == CoordinatedTaskState::TASKSTATE_CONNECTED) { // This may happen if the service processes the initial RegisterTask(), @@ -565,6 +590,7 @@ Status CoordinationServiceStandaloneImpl::RegisterTask( LOG(INFO) << task_name << " has connected to coordination service with the same " << "incarnation again: " << incarnation; + LogConnectStatusLocked(); return OkStatus(); } else { error_message = diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc index a45213d1817624..79065f7a9118ab 100644 --- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc +++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -92,29 +93,27 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent { Status Shutdown() override; Status Reset() override; - StatusOr GetKeyValue(const std::string& key) override; + StatusOr GetKeyValue(std::string_view key) override; StatusOr GetKeyValue(const char* key, int64_t key_size) override; - StatusOr GetKeyValue(const std::string& key, + StatusOr GetKeyValue(std::string_view key, absl::Duration timeout) override; std::shared_ptr GetKeyValueAsync( - const std::string& key, StatusOrValueCallback done) override; - StatusOr TryGetKeyValue(const std::string& key) override; + std::string_view key, StatusOrValueCallback done) override; + StatusOr TryGetKeyValue(std::string_view key) override; StatusOr> GetKeyValueDir( - const std::string& key) override; - void GetKeyValueDirAsync(const std::string& key, + std::string_view key) override; + void GetKeyValueDirAsync(std::string_view key, StatusOrValueDirCallback done) override; - Status InsertKeyValue(const std::string& key, - const std::string& value) override; + Status InsertKeyValue(std::string_view key, std::string_view value) override; Status InsertKeyValue(const char* key, int64_t key_size, const char* value, int64_t value_size) override; - Status DeleteKeyValue(const std::string& key) override; + Status DeleteKeyValue(std::string_view key) override; Status DeleteKeyValue(const char* key, int64_t key_size) override; - Status UpdateKeyValue(const std::string& key, - const std::string& value) override; + Status UpdateKeyValue(std::string_view key, std::string_view value) override; - Status StartWatchKey(const std::string& key, + Status StartWatchKey(std::string_view key, ChangedKeyValuesCallback on_change) override; - Status StopWatchKey(const std::string& key) override; + Status StopWatchKey(std::string_view key) override; Status WaitAtBarrier(const std::string& barrier_id, absl::Duration timeout, const std::vector& tasks) override; void WaitAtBarrierAsync(const std::string& barrier_id, absl::Duration timeout, @@ -128,7 +127,7 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent { protected: void SetError(const Status& error) override; - Status ActivateWatch(const std::string& key, + Status ActivateWatch(std::string_view key, const std::map&) override; // Returns an error if agent is not running. If `allow_disconnected` is true, // returns OK even if the agent is in DISCONNECTED state. 
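Editor's note — the coordination_service.cc change above adds `LogConnectStatusLocked()` (with `kPendingStragglerLogLimit = 3`): every successful `RegisterTask` now logs how many tasks are still pending and names a few example stragglers, which makes slow multi-host startups much easier to diagnose. Below is a standalone sketch of that counting-and-sampling logic, using plain standard-library types in place of the service's `cluster_state_` map.

```cpp
// Sketch of the straggler-logging logic added to the coordination service:
// count tasks that have not connected yet and print at most a few of their
// names as examples. Types are simplified stand-ins for the real ones.
#include <iostream>
#include <map>
#include <string>
#include <vector>

enum class TaskState { kDisconnected, kConnected };

constexpr int kPendingStragglerLogLimit = 3;

void LogConnectStatus(const std::map<std::string, TaskState>& cluster_state) {
  const int num_tasks = static_cast<int>(cluster_state.size());
  int pending_tasks = 0;
  std::vector<std::string> stragglers;
  for (const auto& [task_name, state] : cluster_state) {
    if (state != TaskState::kConnected) {
      ++pending_tasks;
      if (static_cast<int>(stragglers.size()) < kPendingStragglerLogLimit) {
        stragglers.push_back(task_name);
      }
    }
  }
  std::cout << "Waiting for " << pending_tasks << "/" << num_tasks
            << " tasks to connect.\n";
  if (!stragglers.empty()) {
    std::cout << "Example stragglers:\n";
    for (const auto& name : stragglers) std::cout << "  " << name << "\n";
  }
}

int main() {
  std::map<std::string, TaskState> cluster = {
      {"/job:worker/replica:0/task:0", TaskState::kConnected},
      {"/job:worker/replica:0/task:1", TaskState::kDisconnected},
      {"/job:worker/replica:0/task:2", TaskState::kDisconnected},
  };
  LogConnectStatus(cluster);  // Waiting for 2/3 tasks to connect.
}
```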
@@ -567,17 +566,17 @@ Status CoordinationServiceAgentImpl::Reset() { } StatusOr CoordinationServiceAgentImpl::GetKeyValue( - const std::string& key) { + std::string_view key) { return GetKeyValue(key, /*timeout=*/absl::InfiniteDuration()); } StatusOr CoordinationServiceAgentImpl::GetKeyValue( const char* key, int64_t key_size) { - return GetKeyValue(std::string(key, key_size)); + return GetKeyValue(std::string_view(key, key_size)); } StatusOr CoordinationServiceAgentImpl::GetKeyValue( - const std::string& key, absl::Duration timeout) { + std::string_view key, absl::Duration timeout) { auto n = std::make_shared(); auto result = std::make_shared>(); GetKeyValueAsync(key, @@ -597,9 +596,9 @@ StatusOr CoordinationServiceAgentImpl::GetKeyValue( } std::shared_ptr CoordinationServiceAgentImpl::GetKeyValueAsync( - const std::string& key, StatusOrValueCallback done) { + std::string_view key, StatusOrValueCallback done) { auto request = std::make_shared(); - request->set_key(key); + request->set_key(key.data(), key.size()); VLOG(3) << "GetKeyValueRequest: " << request->DebugString(); auto response = std::make_shared(); auto call_opts = std::make_shared(); @@ -633,33 +632,31 @@ std::shared_ptr CoordinationServiceAgentImpl::GetKeyValueAsync( } StatusOr CoordinationServiceAgentImpl::TryGetKeyValue( - const std::string& key) { + std::string_view key) { absl::Notification n; StatusOr result; TryGetKeyValueRequest request; - request.set_key(key); + request.set_key(key.data(), key.size()); VLOG(3) << "TryGetKeyValueRequest: " << request.DebugString(); TryGetKeyValueResponse response; - leader_client_->TryGetKeyValueAsync(&request, &response, - [&](const Status& s) { - if (s.ok()) { - result = response.kv().value(); - VLOG(3) << "TryGetKeyValueResponse: " - << result.value(); - } else { - result = s; - VLOG(3) << "TryGetKeyValueResponse: " - << s; - } - n.Notify(); - }); + leader_client_->TryGetKeyValueAsync( + &request, &response, [&](const Status& s) { + if (s.ok()) { + result = response.kv().value(); + VLOG(3) << "TryGetKeyValueResponse: " << result.value(); + } else { + result = s; + VLOG(3) << "TryGetKeyValueResponse: " << s; + } + n.Notify(); + }); n.WaitForNotification(); return result; } StatusOr> -CoordinationServiceAgentImpl::GetKeyValueDir(const std::string& key) { +CoordinationServiceAgentImpl::GetKeyValueDir(std::string_view key) { absl::Notification n; StatusOr> result; GetKeyValueDirAsync( @@ -673,9 +670,9 @@ CoordinationServiceAgentImpl::GetKeyValueDir(const std::string& key) { } void CoordinationServiceAgentImpl::GetKeyValueDirAsync( - const std::string& key, StatusOrValueDirCallback done) { + std::string_view key, StatusOrValueDirCallback done) { auto request = std::make_shared(); - request->set_directory_key(key); + request->set_directory_key(key.data(), key.size()); VLOG(3) << "GetKeyValueDirRequest: " << request->DebugString(); auto response = std::make_shared(); leader_client_->GetKeyValueDirAsync( @@ -694,8 +691,8 @@ void CoordinationServiceAgentImpl::GetKeyValueDirAsync( }); } -Status CoordinationServiceAgentImpl::InsertKeyValue(const std::string& key, - const std::string& value) { +Status CoordinationServiceAgentImpl::InsertKeyValue(std::string_view key, + std::string_view value) { InsertKeyValueRequest request; request.mutable_kv()->set_key(key.data(), key.size()); request.mutable_kv()->set_value(value.data(), value.size()); @@ -717,13 +714,13 @@ Status CoordinationServiceAgentImpl::InsertKeyValue(const char* key, int64_t key_size, const char* value, int64_t value_size) { - return 
InsertKeyValue(std::string(key, key_size), - std::string(value, value_size)); + return InsertKeyValue(std::string_view(key, key_size), + std::string_view(value, value_size)); } -Status CoordinationServiceAgentImpl::DeleteKeyValue(const std::string& key) { +Status CoordinationServiceAgentImpl::DeleteKeyValue(std::string_view key) { DeleteKeyValueRequest request; - request.set_key(key); + request.set_key(key.data(), key.size()); request.set_is_directory(true); VLOG(3) << "DeleteKeyValueRequest: " << request.DebugString(); DeleteKeyValueResponse response; @@ -741,23 +738,23 @@ Status CoordinationServiceAgentImpl::DeleteKeyValue(const std::string& key) { Status CoordinationServiceAgentImpl::DeleteKeyValue(const char* key, int64_t key_size) { - return DeleteKeyValue(std::string(key, key_size)); + return DeleteKeyValue(std::string_view(key, key_size)); } -Status CoordinationServiceAgentImpl::UpdateKeyValue(const std::string& key, - const std::string& value) { +Status CoordinationServiceAgentImpl::UpdateKeyValue(std::string_view key, + std::string_view value) { return MakeCoordinationError(errors::Unimplemented( "CoordinationServiceAgent::UpdateKeyValue is not implemented.")); } Status CoordinationServiceAgentImpl::StartWatchKey( - const std::string& key, + std::string_view key, CoordinationServiceAgentImpl::ChangedKeyValuesCallback on_change) { return MakeCoordinationError(errors::Unimplemented( "CoordinationServiceAgent::StartWatchKey is not implemented.")); } -Status CoordinationServiceAgentImpl::StopWatchKey(const std::string& key) { +Status CoordinationServiceAgentImpl::StopWatchKey(std::string_view key) { return MakeCoordinationError(errors::Unimplemented( "CoordinationServiceAgent::StopWatchKey is not implemented.")); } @@ -774,7 +771,7 @@ void CoordinationServiceAgentImpl::SetError(const Status& error) { } Status CoordinationServiceAgentImpl::ActivateWatch( - const std::string& key, const std::map& kvs) { + std::string_view key, const std::map& kvs) { return MakeCoordinationError(errors::Unimplemented( "CoordinationServiceAgent::ActivateWatch is not implemented.")); } diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h index a567272f9d72ef..f94e6ac9dcb209 100644 --- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h +++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -168,19 +169,19 @@ class CoordinationServiceAgent { // If the key-value is not inserted yet, this is a blocking call that waits // until the corresponding key is inserted. // - errors::DeadlineExceeded: timed out waiting for key. - virtual StatusOr GetKeyValue(const std::string& key) = 0; + virtual StatusOr GetKeyValue(std::string_view key) = 0; virtual StatusOr GetKeyValue(const char* key, int64_t key_size) = 0; - virtual StatusOr GetKeyValue(const std::string& key, + virtual StatusOr GetKeyValue(std::string_view key, absl::Duration timeout) = 0; // Note: Cancel the underlying RPC call with `call_opts->StartCancel()` and // `call_opts->ClearCancelCallback()`. virtual std::shared_ptr GetKeyValueAsync( - const std::string& key, StatusOrValueCallback done) = 0; + std::string_view, StatusOrValueCallback done) = 0; // Get config key-value from the service. 
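Editor's note — the agent's key-value interface migrates from `const std::string&` to `std::string_view`, so callers holding a literal or a (pointer, length) pair no longer pay for a temporary `std::string`, and the proto fields are populated via `set_key(key.data(), key.size())` because a `string_view` is not guaranteed to be NUL-terminated. A small self-contained illustration of that calling convention follows; the `KeyValueStore` class is invented for the example and only the parameter-passing pattern mirrors the patch.

```cpp
// Illustration of the std::string_view-based API style the agent moves to.
#include <iostream>
#include <string>
#include <string_view>
#include <unordered_map>

class KeyValueStore {
 public:
  // string_view accepts std::string, string literals, and (ptr, len) pairs
  // without forcing a copy at the call site.
  void Insert(std::string_view key, std::string_view value) {
    // Materialize explicitly; string_view may not be NUL-terminated, so we
    // always pass (data, size) rather than treating it as a C string.
    data_.emplace(std::string(key), std::string(value));
  }

  bool Get(std::string_view key, std::string* value) const {
    auto it = data_.find(std::string(key));
    if (it == data_.end()) return false;
    *value = it->second;
    return true;
  }

 private:
  std::unordered_map<std::string, std::string> data_;
};

int main() {
  KeyValueStore store;
  const char raw[] = "worker/0/addr";
  store.Insert(std::string_view(raw, sizeof(raw) - 1), "10.0.0.1:8470");
  std::string value;
  if (store.Get("worker/0/addr", &value)) std::cout << value << "\n";
}
```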
// - errors::NotFound: the requested key does not exist. - virtual StatusOr TryGetKeyValue(const std::string& key) = 0; + virtual StatusOr TryGetKeyValue(std::string_view key) = 0; // Get all values under a directory (key). // A value is considered to be in the directory if its key is prefixed with @@ -188,30 +189,30 @@ class CoordinationServiceAgent { // This is not a blocking call. If no keys are found, an empty vector is // returned immediately. virtual StatusOr> GetKeyValueDir( - const std::string& key) = 0; - virtual void GetKeyValueDirAsync(const std::string& key, + std::string_view key) = 0; + virtual void GetKeyValueDirAsync(std::string_view key, StatusOrValueDirCallback done) = 0; // Insert config key-value to the service. // - errors::AlreadyExists: key is already set. - virtual Status InsertKeyValue(const std::string& key, - const std::string& value) = 0; + virtual Status InsertKeyValue(std::string_view key, + std::string_view value) = 0; virtual Status InsertKeyValue(const char* key, int64_t key_size, const char* value, int64_t value_size) = 0; // Delete config keys in the coordination service. - virtual Status DeleteKeyValue(const std::string& key) = 0; + virtual Status DeleteKeyValue(std::string_view key) = 0; virtual Status DeleteKeyValue(const char* key, int64_t key_size) = 0; // Update the value of a config key. - virtual Status UpdateKeyValue(const std::string& key, - const std::string& value) = 0; + virtual Status UpdateKeyValue(std::string_view key, + std::string_view value) = 0; // Register a callback that will be invoked when the key or keys under the key // directory are changed (inserted, deleted, or updated). - virtual Status StartWatchKey(const std::string& key, + virtual Status StartWatchKey(std::string_view key, ChangedKeyValuesCallback on_change) = 0; - virtual Status StopWatchKey(const std::string& key) = 0; + virtual Status StopWatchKey(std::string_view key) = 0; // Blocks until all (or a subset of) tasks are at the barrier or the barrier // fails. @@ -273,7 +274,7 @@ class CoordinationServiceAgent { virtual void SetError(const Status& error) = 0; // Activate the key-value callback watch. 
- virtual Status ActivateWatch(const std::string& key, + virtual Status ActivateWatch(std::string_view, const std::map&) = 0; private: diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/collected_metrics.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/collected_metrics.h index 8582594922adf2..ba67299b57a952 100644 --- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/collected_metrics.h +++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/collected_metrics.h @@ -90,6 +90,7 @@ struct Point { int64_t int64_value; string string_value; bool bool_value; + double double_value; HistogramProto histogram_value; Percentiles percentiles_value; diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h index d988d2f19f15ad..7af6c87e51f0bb 100644 --- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h +++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/collection_registry.h @@ -352,6 +352,18 @@ inline void CollectValue(Percentiles value, Point* const point) { point->percentiles_value = std::move(value); } +template <> +inline void CollectValue(double value, Point* const point) { + point->value_type = ValueType::kDouble; + point->double_value = value; +} + +template <> +inline void CollectValue(std::function value_fn, Point* const point) { + point->value_type = ValueType::kDouble; + point->double_value = value_fn(); +} + // Used by the CollectionRegistry class to collect all the values of all the // metrics in the registry. This is an implementation detail of the // CollectionRegistry class, please do not depend on this. diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h index 93cbe9aa928df0..0b69383b5f2d13 100644 --- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h +++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/gauge.h @@ -65,8 +65,10 @@ class Gauge { std::is_same::value || std::is_same >::value || std::is_same >::value || - std::is_same >::value, - "Gauge only allows bool, int64, and string types."); + std::is_same >::value || + std::is_same >::value || + std::is_same::value, + "Gauge only allows bool, int64, double and string types."); return new Gauge(); } @@ -296,8 +298,10 @@ Gauge* Gauge::New( std::is_same::value || std::is_same >::value || std::is_same >::value || - std::is_same >::value, - "Gauge only allows bool, int64, and string types."); + std::is_same >::value || + std::is_same >::value || + std::is_same::value, + "Gauge only allows bool, int64, double, and string types."); return new Gauge( MetricDef( std::forward(metric_def_args)...)); diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/metric_def.h b/third_party/xla/third_party/tsl/tsl/lib/monitoring/metric_def.h index f8c21c360a2b09..ab454664691b1e 100644 --- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/metric_def.h +++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/metric_def.h @@ -47,7 +47,8 @@ enum class ValueType : int { kHistogram, kString, kBool, - kPercentiles + kPercentiles, + kDouble }; // Everything in the internal namespace is implementation details. 
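Editor's note — the monitoring changes above extend `Gauge` beyond bool/int64/string: a cell may now hold a `double`, or a `std::function<double()>` that is evaluated at collection time, and `CollectValue`/`ValueType` gain a `kDouble` case so the collected `Point` carries `double_value`. Below is a minimal sketch of a callback-backed double gauge; it is a standalone toy, not the `tsl::monitoring` classes.

```cpp
// Sketch of a double-valued gauge whose value can either be set directly or
// be computed by a callback at collection time, mirroring the new
// double / std::function<double()> support added to the monitoring Gauge.
#include <functional>
#include <iostream>
#include <variant>

class DoubleGaugeCell {
 public:
  void Set(double value) { value_ = value; }
  void Set(std::function<double()> fn) { value_ = std::move(fn); }

  // "Collection": resolve the stored value, invoking the callback if needed.
  double Collect() const {
    if (auto* fn = std::get_if<std::function<double()>>(&value_)) return (*fn)();
    return std::get<double>(value_);
  }

 private:
  std::variant<double, std::function<double()>> value_{0.0};
};

int main() {
  DoubleGaugeCell memory_utilization;
  memory_utilization.Set(0.42);
  std::cout << memory_utilization.Collect() << "\n";  // 0.42

  DoubleGaugeCell queue_depth;
  queue_depth.Set([] { return 17.0; });        // evaluated lazily at collection
  std::cout << queue_depth.Collect() << "\n";  // 17
}
```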
Do not depend @@ -97,6 +98,16 @@ inline ValueType GetValueType>() { return ValueType::kBool; } +template <> +inline ValueType GetValueType() { + return ValueType::kDouble; +} + +template <> +inline ValueType GetValueType>() { + return ValueType::kDouble; +} + } // namespace internal // Abstract base class for a metric definition. diff --git a/third_party/xla/third_party/tsl/tsl/platform/cpu_info.cc b/third_party/xla/third_party/tsl/tsl/platform/cpu_info.cc index c25c354fd37cac..1de5eb8031623d 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/cpu_info.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/cpu_info.cc @@ -82,6 +82,7 @@ class CPUIDInfo { : have_adx_(0), have_aes_(0), have_amx_bf16_(0), + have_amx_fp16_(0), have_amx_int8_(0), have_amx_tile_(0), have_avx_(0), @@ -98,8 +99,11 @@ class CPUIDInfo { have_avx512_4vnniw_(0), have_avx512_4fmaps_(0), have_avx512_bf16_(0), + have_avx512_fp16_(0), have_avx512_vnni_(0), have_avx_vnni_(0), + have_avx_vnni_int8_(0), + have_avx_ne_convert_(0), have_bmi1_(0), have_bmi2_(0), have_cmov_(0), @@ -226,12 +230,19 @@ class CPUIDInfo { cpuid->have_amx_int8_ = (edx >> 25) & 0x1; cpuid->have_amx_bf16_ = (edx >> 22) & 0x1; + // Check for avx512_fp16 using information from Xbyak in oneDNN: + // https://github.com/oneapi-src/oneDNN/blob/acf8d214cedfe7e24c9446bacc1f9f648c9273f8/src/cpu/x64/xbyak/xbyak_util.h#L516 + cpuid->have_avx512_fp16_ = have_avx512 && ((edx >> 23) & 0x1); + // Get more Structured Extended Feature info by issuing CPUID with // sub-leaf = 1 (eax = 7, ecx = 1) if (kMaxNumSubLeaves >= 1) { GETCPUID(eax, ebx, ecx, edx, 7, 1); cpuid->have_avx_vnni_ = (eax >> 4) & 0x1; cpuid->have_avx512_bf16_ = have_avx512 && ((eax >> 5) & 0x1); + cpuid->have_amx_fp16_ = (eax >> 21) & 0x1; + cpuid->have_avx_vnni_int8_ = (edx >> 4) & 0x1; + cpuid->have_avx_ne_convert_ = (edx >> 5) & 0x1; } } @@ -242,6 +253,7 @@ class CPUIDInfo { case ADX: return cpuid->have_adx_; case AES: return cpuid->have_aes_; case AMX_BF16: return cpuid->have_amx_bf16_; + case AMX_FP16: return cpuid->have_amx_fp16_; case AMX_INT8: return cpuid->have_amx_int8_; case AMX_TILE: return cpuid->have_amx_tile_; case AVX2: return cpuid->have_avx2_; @@ -258,8 +270,11 @@ class CPUIDInfo { case AVX512_4VNNIW: return cpuid->have_avx512_4vnniw_; case AVX512_4FMAPS: return cpuid->have_avx512_4fmaps_; case AVX512_BF16: return cpuid->have_avx512_bf16_; + case AVX512_FP16: return cpuid->have_avx512_fp16_; case AVX512_VNNI: return cpuid->have_avx512_vnni_; case AVX_VNNI: return cpuid->have_avx_vnni_; + case AVX_VNNI_INT8: return cpuid->have_avx_vnni_int8_; + case AVX_NE_CONVERT: return cpuid->have_avx_ne_convert_; case BMI1: return cpuid->have_bmi1_; case BMI2: return cpuid->have_bmi2_; case CMOV: return cpuid->have_cmov_; @@ -297,6 +312,7 @@ class CPUIDInfo { int have_adx_ : 1; int have_aes_ : 1; int have_amx_bf16_ : 1; + int have_amx_fp16_ : 1; int have_amx_int8_ : 1; int have_amx_tile_ : 1; int have_avx_ : 1; @@ -313,8 +329,11 @@ class CPUIDInfo { int have_avx512_4vnniw_ : 1; int have_avx512_4fmaps_ : 1; int have_avx512_bf16_ : 1; + int have_avx512_fp16_ : 1; int have_avx512_vnni_ : 1; int have_avx_vnni_ : 1; + int have_avx_vnni_int8_ : 1; + int have_avx_ne_convert_ : 1; int have_bmi1_ : 1; int have_bmi2_ : 1; int have_cmov_ : 1; diff --git a/third_party/xla/third_party/tsl/tsl/platform/cpu_info.h b/third_party/xla/third_party/tsl/tsl/platform/cpu_info.h index e0b0d66bb11118..68506b1d34ae8e 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/cpu_info.h +++ 
b/third_party/xla/third_party/tsl/tsl/platform/cpu_info.h @@ -132,6 +132,11 @@ enum CPUFeature { AMX_TILE = 41, // Tile configuration and load/store AMX_INT8 = 42, // Int8 tile matrix multiplication AMX_BF16 = 43, // Bfloat16 tile matrix multiplication + + AVX512_FP16 = 44, // Float16 neural network + AMX_FP16 = 45, // Float16 tile matrix multiplication + AVX_NE_CONVERT = 46, // Instructions for faster bfloat16, float16 convert. + AVX_VNNI_INT8 = 47, // VNNI instructions for combinations of u8, s8 dtypes. }; enum Aarch64CPU { diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD index e56abd66607093..aac69570b88c4f 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD +++ b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD @@ -82,12 +82,11 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - "//tsl/platform:env", - "//tsl/platform:errors", + "//tsl/platform:load_library", "//tsl/platform:logging", "//tsl/platform:path", - "//tsl/platform:status", - "//tsl/platform:statusor", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", "@local_config_cuda//cuda:cuda_headers", @@ -247,8 +246,7 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - "//tsl/platform:errors", - "//tsl/platform:status", + "@com_google_absl//absl/status", ], ) @@ -362,7 +360,6 @@ cc_library( "//tsl:with_numa_support": ["TENSORFLOW_USE_NUMA"], "//conditions:default": [], }), - features = ["-layering_check"], tags = [ "manual", "no_oss", diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/build_config/BUILD b/third_party/xla/third_party/tsl/tsl/platform/default/build_config/BUILD index 93f35c45c0569d..2d6dfda0028a1b 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/build_config/BUILD +++ b/third_party/xla/third_party/tsl/tsl/platform/default/build_config/BUILD @@ -117,10 +117,16 @@ cc_library( data = [ "@local_config_cuda//cuda:cudart", ], - linkopts = [ - "-Wl,-rpath,../local_config_cuda/cuda/lib64", - "-Wl,-rpath,../local_config_cuda/cuda/extras/CUPTI/lib64", - ], + linkopts = select({ + "//tsl:macos": [ + "-Wl,-rpath,../local_config_cuda/cuda/lib", + "-Wl,-rpath,../local_config_cuda/cuda/extras/CUPTI/lib", + ], + "//conditions:default": [ + "-Wl,-rpath,../local_config_cuda/cuda/lib64", + "-Wl,-rpath,../local_config_cuda/cuda/extras/CUPTI/lib64", + ], + }), visibility = ["//visibility:public"], deps = [ "@local_config_cuda//cuda:cudart", diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker.cc b/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker.cc index 2d67789d8a0017..eb8fff80bfb6ac 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker.cc @@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
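A hedged sketch of how the newly wired CPU feature bits would typically be queried, assuming the existing tsl::port::TestCPUFeature entry point in cpu_info.h (the dispatch function itself is invented for illustration):

#include "tsl/platform/cpu_info.h"

// Sketch only: choose an fp16 code path when the new features are present.
bool PreferFp16Path() {
  return tsl::port::TestCPUFeature(tsl::port::AVX512_FP16) ||
         tsl::port::TestCPUFeature(tsl::port::AMX_FP16);
}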
==============================================================================*/ +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "tsl/platform/default/dso_loader.h" -#include "tsl/platform/errors.h" #include "tsl/platform/logging.h" -#include "tsl/platform/status.h" -#include "tsl/platform/statusor.h" namespace tsl { namespace internal { namespace DsoLoader { -Status TryDlopenCUDALibraries() { +absl::Status TryDlopenCUDALibraries() { namespace CachedLoader = ::tsl::internal::CachedDsoLoader; auto cudart_status = CachedLoader::GetCudaRuntimeDsoHandle(); auto cublas_status = CachedLoader::GetCublasDsoHandle(); @@ -36,14 +35,14 @@ Status TryDlopenCUDALibraries() { !cufft_status.status().ok() || !cusolver_status.status().ok() || !cusparse_status.status().ok() || !cudnn_status.status().ok() || !cublaslt_status.status().ok()) { - return Status(absl::StatusCode::kInternal, - absl::StrCat("Cannot dlopen all CUDA libraries.")); + return absl::Status(absl::StatusCode::kInternal, + absl::StrCat("Cannot dlopen all CUDA libraries.")); } else { - return tsl::OkStatus(); + return absl::OkStatus(); } } -Status TryDlopenROCmLibraries() { +absl::Status TryDlopenROCmLibraries() { auto rocblas_status = GetRocblasDsoHandle(); auto miopen_status = GetMiopenDsoHandle(); auto rocfft_status = GetHipfftDsoHandle(); @@ -57,32 +56,30 @@ Status TryDlopenROCmLibraries() { || !hipblaslt_status.status().ok() #endif ) { - return Status(absl::StatusCode::kInternal, - absl::StrCat("Cannot dlopen all ROCm libraries.")); + return absl::InternalError("Cannot dlopen all ROCm libraries."); } else { - return tsl::OkStatus(); + return absl::OkStatus(); } } -Status MaybeTryDlopenGPULibraries() { +absl::Status MaybeTryDlopenGPULibraries() { #if GOOGLE_CUDA return TryDlopenCUDALibraries(); #elif TENSORFLOW_USE_ROCM return TryDlopenROCmLibraries(); #else LOG(INFO) << "Not built with GPU enabled. Skip GPU library dlopen check."; - return tsl::OkStatus(); + return absl::OkStatus(); #endif } -Status TryDlopenTensorRTLibraries() { +absl::Status TryDlopenTensorRTLibraries() { auto nvinfer_status = GetNvInferDsoHandle(); auto nvinferplugin_status = GetNvInferPluginDsoHandle(); if (!nvinfer_status.status().ok() || !nvinferplugin_status.status().ok()) { - return Status(absl::StatusCode::kInternal, - absl::StrCat("Cannot dlopen all TensorRT libraries.")); + return absl::InternalError("Cannot dlopen all TensorRT libraries."); } else { - return tsl::OkStatus(); + return absl::OkStatus(); } } diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker_stub.cc b/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker_stub.cc index 1d4b213427b5a0..67f734302835d8 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker_stub.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/dlopen_checker_stub.cc @@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "absl/status/status.h" #include "tsl/platform/default/dso_loader.h" #include "tsl/platform/logging.h" -#include "tsl/platform/status.h" namespace tsl { namespace internal { namespace DsoLoader { // Skip check when GPU libraries are statically linked. 
-Status MaybeTryDlopenGPULibraries() { +absl::Status MaybeTryDlopenGPULibraries() { LOG(INFO) << "GPU libraries are statically linked, skip dlopen check."; - return ::tsl::OkStatus(); + return absl::OkStatus(); } } // namespace DsoLoader } // namespace internal diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc index fd28f05590683c..a835a81489367a 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.cc @@ -16,17 +16,18 @@ limitations under the License. #include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "third_party/gpus/cuda/cuda_config.h" #include "third_party/nccl/nccl_config.h" -#include "tsl/platform/env.h" -#include "tsl/platform/errors.h" +#include "tsl/platform/load_library.h" #include "tsl/platform/logging.h" #include "tsl/platform/path.h" #include "tsl/platform/platform.h" -#include "tsl/platform/status.h" -#include "tsl/platform/statusor.h" #include "third_party/tensorrt/tensorrt_config.h" #if TENSORFLOW_USE_ROCM @@ -37,22 +38,23 @@ namespace tsl { namespace internal { namespace { -string GetCudaVersion() { return TF_CUDA_VERSION; } -string GetCudaRtVersion() { return TF_CUDART_VERSION; } -string GetCuptiVersion() { return TF_CUPTI_VERSION; } -string GetCudnnVersion() { return TF_CUDNN_VERSION; } -string GetCublasVersion() { return TF_CUBLAS_VERSION; } -string GetCusolverVersion() { return TF_CUSOLVER_VERSION; } -string GetCufftVersion() { return TF_CUFFT_VERSION; } -string GetCusparseVersion() { return TF_CUSPARSE_VERSION; } -string GetNcclVersion() { return TF_NCCL_VERSION; } -string GetTensorRTVersion() { return TF_TENSORRT_VERSION; } - -StatusOr GetDsoHandle(const string& name, const string& version) { - auto filename = Env::Default()->FormatLibraryFileName(name, version); +std::string GetCudaVersion() { return TF_CUDA_VERSION; } +std::string GetCudaRtVersion() { return TF_CUDART_VERSION; } +std::string GetCuptiVersion() { return TF_CUPTI_VERSION; } +std::string GetCudnnVersion() { return TF_CUDNN_VERSION; } +std::string GetCublasVersion() { return TF_CUBLAS_VERSION; } +std::string GetCusolverVersion() { return TF_CUSOLVER_VERSION; } +std::string GetCufftVersion() { return TF_CUFFT_VERSION; } +std::string GetCusparseVersion() { return TF_CUSPARSE_VERSION; } +std::string GetNcclVersion() { return TF_NCCL_VERSION; } +std::string GetTensorRTVersion() { return TF_TENSORRT_VERSION; } + +absl::StatusOr GetDsoHandle(const std::string& name, + const std::string& version) { + auto filename = tsl::internal::FormatLibraryFileName(name, version); void* dso_handle; - Status status = - Env::Default()->LoadDynamicLibrary(filename.c_str(), &dso_handle); + absl::Status status = + tsl::internal::LoadDynamicLibrary(filename.c_str(), &dso_handle); if (status.ok()) { VLOG(1) << "Successfully opened dynamic library " << filename; return dso_handle; @@ -60,41 +62,56 @@ StatusOr GetDsoHandle(const string& name, const string& version) { auto message = absl::StrCat("Could not load dynamic library '", filename, "'; dlerror: ", status.message()); +#if !defined(PLATFORM_WINDOWS) + if (const char* ld_library_path = getenv("LD_LIBRARY_PATH")) { + message += absl::StrCat("; LD_LIBRARY_PATH: ", ld_library_path); + } +#endif VLOG(1) << message; - return Status(absl::StatusCode::kFailedPrecondition, 
message); + return absl::Status(absl::StatusCode::kFailedPrecondition, message); } } // namespace namespace DsoLoader { -StatusOr GetCudaDriverDsoHandle() { +absl::StatusOr GetCudaDriverDsoHandle() { +#if defined(PLATFORM_WINDOWS) + return GetDsoHandle("nvcuda", ""); +#elif defined(__APPLE__) + // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of + // libcuda.1.dylib. + auto handle_or = GetDsoHandle("cuda", ""); + if (handle_or.ok()) { + return handle_or; + } +#endif return GetDsoHandle("cuda", "1"); } -StatusOr GetCudaRuntimeDsoHandle() { +absl::StatusOr GetCudaRuntimeDsoHandle() { return GetDsoHandle("cudart", GetCudaRtVersion()); } -StatusOr GetCublasDsoHandle() { +absl::StatusOr GetCublasDsoHandle() { return GetDsoHandle("cublas", GetCublasVersion()); } -StatusOr GetCublasLtDsoHandle() { +absl::StatusOr GetCublasLtDsoHandle() { return GetDsoHandle("cublasLt", GetCublasVersion()); } -StatusOr GetCufftDsoHandle() { +absl::StatusOr GetCufftDsoHandle() { return GetDsoHandle("cufft", GetCufftVersion()); } -StatusOr GetCusolverDsoHandle() { +absl::StatusOr GetCusolverDsoHandle() { return GetDsoHandle("cusolver", GetCusolverVersion()); } -StatusOr GetCusparseDsoHandle() { +absl::StatusOr GetCusparseDsoHandle() { return GetDsoHandle("cusparse", GetCusparseVersion()); } -StatusOr GetCuptiDsoHandle() { +absl::StatusOr GetCuptiDsoHandle() { // Load specific version of CUPTI this is built. auto status_or_handle = GetDsoHandle("cupti", GetCuptiVersion()); if (status_or_handle.ok()) return status_or_handle; @@ -102,150 +119,166 @@ StatusOr GetCuptiDsoHandle() { return GetDsoHandle("cupti", ""); } -StatusOr GetCudnnDsoHandle() { +absl::StatusOr GetCudnnDsoHandle() { return GetDsoHandle("cudnn", GetCudnnVersion()); } -StatusOr GetNcclDsoHandle() { +absl::StatusOr GetNcclDsoHandle() { return GetDsoHandle("nccl", GetNcclVersion()); } -StatusOr GetNvInferDsoHandle() { +absl::StatusOr GetNvInferDsoHandle() { +#if defined(PLATFORM_WINDOWS) + return GetDsoHandle("nvinfer", ""); +#else return GetDsoHandle("nvinfer", GetTensorRTVersion()); +#endif } -StatusOr GetNvInferPluginDsoHandle() { +absl::StatusOr GetNvInferPluginDsoHandle() { +#if defined(PLATFORM_WINDOWS) + return GetDsoHandle("nvinfer_plugin", ""); +#else return GetDsoHandle("nvinfer_plugin", GetTensorRTVersion()); +#endif } -StatusOr GetRocblasDsoHandle() { return GetDsoHandle("rocblas", ""); } +absl::StatusOr GetRocblasDsoHandle() { + return GetDsoHandle("rocblas", ""); +} -StatusOr GetMiopenDsoHandle() { return GetDsoHandle("MIOpen", ""); } +absl::StatusOr GetMiopenDsoHandle() { + return GetDsoHandle("MIOpen", ""); +} -StatusOr GetHipfftDsoHandle() { return GetDsoHandle("hipfft", ""); } +absl::StatusOr GetHipfftDsoHandle() { + return GetDsoHandle("hipfft", ""); +} -StatusOr GetRocrandDsoHandle() { return GetDsoHandle("rocrand", ""); } +absl::StatusOr GetRocrandDsoHandle() { + return GetDsoHandle("rocrand", ""); +} -StatusOr GetRocsolverDsoHandle() { +absl::StatusOr GetRocsolverDsoHandle() { return GetDsoHandle("rocsolver", ""); } #if TF_ROCM_VERSION >= 40500 -StatusOr GetHipsolverDsoHandle() { +absl::StatusOr GetHipsolverDsoHandle() { return GetDsoHandle("hipsolver", ""); } #endif -StatusOr GetRoctracerDsoHandle() { +absl::StatusOr GetRoctracerDsoHandle() { return GetDsoHandle("roctracer64", ""); } -StatusOr GetHipsparseDsoHandle() { +absl::StatusOr GetHipsparseDsoHandle() { return GetDsoHandle("hipsparse", ""); } -StatusOr GetHipblasltDsoHandle() { +absl::StatusOr GetHipblasltDsoHandle() { return GetDsoHandle("hipblaslt", ""); } 
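For reference, a call site for the migrated loaders might look like the following; the choice of library is arbitrary, and only the absl::StatusOr<void*> shape comes from this change:

// Sketch only: consuming the cached loader after the Status -> absl::Status move.
absl::StatusOr<void*> cudart_or =
    tsl::internal::CachedDsoLoader::GetCudaRuntimeDsoHandle();
if (!cudart_or.ok()) {
  LOG(WARNING) << "CUDA runtime not available: " << cudart_or.status();
} else {
  void* cudart_handle = *cudart_or;  // dlopen handle, cached for later calls.
  (void)cudart_handle;
}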
-StatusOr GetHipDsoHandle() { return GetDsoHandle("amdhip64", ""); } +absl::StatusOr GetHipDsoHandle() { return GetDsoHandle("amdhip64", ""); } } // namespace DsoLoader namespace CachedDsoLoader { -StatusOr GetCudaDriverDsoHandle() { +absl::StatusOr GetCudaDriverDsoHandle() { static auto result = new auto(DsoLoader::GetCudaDriverDsoHandle()); return *result; } -StatusOr GetCudaRuntimeDsoHandle() { +absl::StatusOr GetCudaRuntimeDsoHandle() { static auto result = new auto(DsoLoader::GetCudaRuntimeDsoHandle()); return *result; } -StatusOr GetCublasDsoHandle() { +absl::StatusOr GetCublasDsoHandle() { static auto result = new auto(DsoLoader::GetCublasDsoHandle()); return *result; } -StatusOr GetCublasLtDsoHandle() { +absl::StatusOr GetCublasLtDsoHandle() { static auto result = new auto(DsoLoader::GetCublasLtDsoHandle()); return *result; } -StatusOr GetCufftDsoHandle() { +absl::StatusOr GetCufftDsoHandle() { static auto result = new auto(DsoLoader::GetCufftDsoHandle()); return *result; } -StatusOr GetCusolverDsoHandle() { +absl::StatusOr GetCusolverDsoHandle() { static auto result = new auto(DsoLoader::GetCusolverDsoHandle()); return *result; } -StatusOr GetCusparseDsoHandle() { +absl::StatusOr GetCusparseDsoHandle() { static auto result = new auto(DsoLoader::GetCusparseDsoHandle()); return *result; } -StatusOr GetCuptiDsoHandle() { +absl::StatusOr GetCuptiDsoHandle() { static auto result = new auto(DsoLoader::GetCuptiDsoHandle()); return *result; } -StatusOr GetCudnnDsoHandle() { +absl::StatusOr GetCudnnDsoHandle() { static auto result = new auto(DsoLoader::GetCudnnDsoHandle()); return *result; } -StatusOr GetRocblasDsoHandle() { +absl::StatusOr GetRocblasDsoHandle() { static auto result = new auto(DsoLoader::GetRocblasDsoHandle()); return *result; } -StatusOr GetMiopenDsoHandle() { +absl::StatusOr GetMiopenDsoHandle() { static auto result = new auto(DsoLoader::GetMiopenDsoHandle()); return *result; } -StatusOr GetHipfftDsoHandle() { +absl::StatusOr GetHipfftDsoHandle() { static auto result = new auto(DsoLoader::GetHipfftDsoHandle()); return *result; } -StatusOr GetRocrandDsoHandle() { +absl::StatusOr GetRocrandDsoHandle() { static auto result = new auto(DsoLoader::GetRocrandDsoHandle()); return *result; } -StatusOr GetRoctracerDsoHandle() { +absl::StatusOr GetRoctracerDsoHandle() { static auto result = new auto(DsoLoader::GetRoctracerDsoHandle()); return *result; } -StatusOr GetRocsolverDsoHandle() { +absl::StatusOr GetRocsolverDsoHandle() { static auto result = new auto(DsoLoader::GetRocsolverDsoHandle()); return *result; } #if TF_ROCM_VERSION >= 40500 -StatusOr GetHipsolverDsoHandle() { +absl::StatusOr GetHipsolverDsoHandle() { static auto result = new auto(DsoLoader::GetHipsolverDsoHandle()); return *result; } #endif -StatusOr GetHipsparseDsoHandle() { +absl::StatusOr GetHipsparseDsoHandle() { static auto result = new auto(DsoLoader::GetHipsparseDsoHandle()); return *result; } -StatusOr GetHipblasltDsoHandle() { +absl::StatusOr GetHipblasltDsoHandle() { static auto result = new auto(DsoLoader::GetHipblasltDsoHandle()); return *result; } -StatusOr GetHipDsoHandle() { +absl::StatusOr GetHipDsoHandle() { static auto result = new auto(DsoLoader::GetHipDsoHandle()); return *result; } diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h index ee5b2b28af3486..6f72484d504f53 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h +++ 
b/third_party/xla/third_party/tsl/tsl/platform/default/dso_loader.h @@ -19,8 +19,8 @@ limitations under the License. #ifndef TENSORFLOW_TSL_PLATFORM_DEFAULT_DSO_LOADER_H_ #define TENSORFLOW_TSL_PLATFORM_DEFAULT_DSO_LOADER_H_ -#include "tsl/platform/status.h" -#include "tsl/platform/statusor.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" namespace tsl { namespace internal { @@ -28,65 +28,65 @@ namespace internal { namespace DsoLoader { // The following methods either load the DSO of interest and return a dlopen // handle or error status. -StatusOr GetCudaDriverDsoHandle(); -StatusOr GetCudaRuntimeDsoHandle(); -StatusOr GetCublasDsoHandle(); -StatusOr GetCublasLtDsoHandle(); -StatusOr GetCufftDsoHandle(); -StatusOr GetCusolverDsoHandle(); -StatusOr GetCusparseDsoHandle(); -StatusOr GetCuptiDsoHandle(); -StatusOr GetCudnnDsoHandle(); -StatusOr GetNcclDsoHandle(); -StatusOr GetNvInferDsoHandle(); -StatusOr GetNvInferPluginDsoHandle(); +absl::StatusOr GetCudaDriverDsoHandle(); +absl::StatusOr GetCudaRuntimeDsoHandle(); +absl::StatusOr GetCublasDsoHandle(); +absl::StatusOr GetCublasLtDsoHandle(); +absl::StatusOr GetCufftDsoHandle(); +absl::StatusOr GetCusolverDsoHandle(); +absl::StatusOr GetCusparseDsoHandle(); +absl::StatusOr GetCuptiDsoHandle(); +absl::StatusOr GetCudnnDsoHandle(); +absl::StatusOr GetNcclDsoHandle(); +absl::StatusOr GetNvInferDsoHandle(); +absl::StatusOr GetNvInferPluginDsoHandle(); -StatusOr GetRocblasDsoHandle(); -StatusOr GetMiopenDsoHandle(); -StatusOr GetHipfftDsoHandle(); -StatusOr GetRocrandDsoHandle(); -StatusOr GetRoctracerDsoHandle(); -StatusOr GetRocsolverDsoHandle(); -StatusOr GetHipsolverDsoHandle(); -StatusOr GetHipsparseDsoHandle(); -StatusOr GetHipDsoHandle(); +absl::StatusOr GetRocblasDsoHandle(); +absl::StatusOr GetMiopenDsoHandle(); +absl::StatusOr GetHipfftDsoHandle(); +absl::StatusOr GetRocrandDsoHandle(); +absl::StatusOr GetRoctracerDsoHandle(); +absl::StatusOr GetRocsolverDsoHandle(); +absl::StatusOr GetHipsolverDsoHandle(); +absl::StatusOr GetHipsparseDsoHandle(); +absl::StatusOr GetHipDsoHandle(); // The following method tries to dlopen all necessary GPU libraries for the GPU // platform TF is built with (CUDA or ROCm) only when these libraries should be // dynamically loaded. Error status is returned when any of the libraries cannot // be dlopened. -Status MaybeTryDlopenGPULibraries(); +absl::Status MaybeTryDlopenGPULibraries(); // The following method tries to dlopen all necessary TensorRT libraries when // these libraries should be dynamically loaded. Error status is returned when // any of the libraries cannot be dlopened. -Status TryDlopenTensorRTLibraries(); +absl::Status TryDlopenTensorRTLibraries(); } // namespace DsoLoader // Wrapper around the DsoLoader that prevents us from dlopen'ing any of the DSOs // more than once. namespace CachedDsoLoader { // Cached versions of the corresponding DsoLoader methods above. 
-StatusOr GetCudaDriverDsoHandle(); -StatusOr GetCudaRuntimeDsoHandle(); -StatusOr GetCublasDsoHandle(); -StatusOr GetCublasLtDsoHandle(); -StatusOr GetCufftDsoHandle(); -StatusOr GetCusolverDsoHandle(); -StatusOr GetCusparseDsoHandle(); -StatusOr GetCuptiDsoHandle(); -StatusOr GetCudnnDsoHandle(); +absl::StatusOr GetCudaDriverDsoHandle(); +absl::StatusOr GetCudaRuntimeDsoHandle(); +absl::StatusOr GetCublasDsoHandle(); +absl::StatusOr GetCublasLtDsoHandle(); +absl::StatusOr GetCufftDsoHandle(); +absl::StatusOr GetCusolverDsoHandle(); +absl::StatusOr GetCusparseDsoHandle(); +absl::StatusOr GetCuptiDsoHandle(); +absl::StatusOr GetCudnnDsoHandle(); -StatusOr GetRocblasDsoHandle(); -StatusOr GetMiopenDsoHandle(); -StatusOr GetHipfftDsoHandle(); -StatusOr GetRocrandDsoHandle(); -StatusOr GetRocsolverDsoHandle(); -StatusOr GetHipsolverDsoHandle(); -StatusOr GetRoctracerDsoHandle(); -StatusOr GetHipsparseDsoHandle(); -StatusOr GetHipblasltDsoHandle(); -StatusOr GetHipDsoHandle(); +absl::StatusOr GetRocblasDsoHandle(); +absl::StatusOr GetMiopenDsoHandle(); +absl::StatusOr GetHipfftDsoHandle(); +absl::StatusOr GetRocrandDsoHandle(); +absl::StatusOr GetRocsolverDsoHandle(); +absl::StatusOr GetHipsolverDsoHandle(); +absl::StatusOr GetRoctracerDsoHandle(); +absl::StatusOr GetHipsparseDsoHandle(); +absl::StatusOr GetHipblasltDsoHandle(); +absl::StatusOr GetHipDsoHandle(); } // namespace CachedDsoLoader } // namespace internal diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/load_library.cc b/third_party/xla/third_party/tsl/tsl/platform/default/load_library.cc index f49adf2f7f257d..70961c8dc990ef 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/load_library.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/load_library.cc @@ -17,26 +17,26 @@ limitations under the License. #include -#include "tsl/platform/errors.h" -#include "tsl/platform/status.h" +#include + +#include "absl/status/status.h" namespace tsl { namespace internal { -Status LoadDynamicLibrary(const char* library_filename, void** handle) { +absl::Status LoadDynamicLibrary(const char* library_filename, void** handle) { *handle = dlopen(library_filename, RTLD_NOW | RTLD_LOCAL); if (!*handle) { // Note that in C++17 std::string_view(nullptr) gives segfault! const char* error_msg = dlerror(); - return tsl::errors::NotFound(error_msg ? error_msg - : "(null error message)"); + return absl::NotFoundError(error_msg ? error_msg : "(null error message)"); } - return OkStatus(); + return absl::OkStatus(); } -Status GetSymbolFromLibrary(void* handle, const char* symbol_name, - void** symbol) { +absl::Status GetSymbolFromLibrary(void* handle, const char* symbol_name, + void** symbol) { // Check that the handle is not NULL to avoid dlsym's RTLD_DEFAULT behavior. if (!handle) { *symbol = nullptr; @@ -46,14 +46,14 @@ Status GetSymbolFromLibrary(void* handle, const char* symbol_name, if (!*symbol) { // Note that in C++17 std::string_view(nullptr) gives segfault! const char* error_msg = dlerror(); - return tsl::errors::NotFound(error_msg ? error_msg - : "(null error message)"); + return absl::NotFoundError(error_msg ? 
error_msg : "(null error message)"); } - return OkStatus(); + return absl::OkStatus(); } -string FormatLibraryFileName(const string& name, const string& version) { - string filename; +std::string FormatLibraryFileName(const std::string& name, + const std::string& version) { + std::string filename; #if defined(__APPLE__) if (version.size() == 0) { filename = "lib" + name + ".dylib"; diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/port.cc b/third_party/xla/third_party/tsl/tsl/platform/default/port.cc index c2151c78ec5330..868fb35f887dab 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/port.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/port.cc @@ -15,6 +15,7 @@ limitations under the License. #include "absl/base/internal/sysinfo.h" #include "tsl/platform/cpu_info.h" +#include "tsl/platform/host_info.h" #include "tsl/platform/logging.h" #include "tsl/platform/mem.h" #include "tsl/platform/numa.h" @@ -256,7 +257,6 @@ int NUMAGetThreadNodeAffinity() { return node_index; } - void* NUMAMalloc(int node, size_t size, int minimum_alignment) { #ifdef TENSORFLOW_USE_NUMA if (HaveHWLocTopology()) { @@ -307,7 +307,6 @@ int NUMAGetMemAffinity(const void* addr) { return node; } - bool Snappy_Compress(const char* input, size_t length, string* output) { #ifdef TF_USE_SNAPPY output->resize(snappy::MaxCompressedLength(length)); @@ -447,5 +446,8 @@ MemoryBandwidthInfo GetMemoryBandwidthInfo() { MemoryBandwidthInfo membw_info = {INT64_MAX}; return membw_info; } + +IOStatistics GetIOStatistics() { return IOStatistics(); } + } // namespace port } // namespace tsl diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.cc b/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.cc index d750328ebf38fd..c786295c08e0e9 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/default/subprocess.cc @@ -30,7 +30,11 @@ limitations under the License. #include "tsl/platform/logging.h" // Android versions older than 28 do not have posix_spawn(). -#define USE_POSIX_SPAWN !defined(__ANDROID_API__) || __ANDROID_API__ >= 28 +#if !defined(__ANDROID_API__) || __ANDROID_API__ >= 28 +#define USE_POSIX_SPAWN 1 +#else // defined(__ANDROID_API__) && __ANDROID_API__ < 28 +#define USE_POSIX_SPAWN 0 +#endif // !defined(__ANDROID_API__) || __ANDROID_API__ >= 28 // 1) FYI from m3b@ about fork(): // A danger of calling fork() (as opposed to clone() or vfork()) is that if diff --git a/third_party/xla/third_party/tsl/tsl/platform/denormal.cc b/third_party/xla/third_party/tsl/tsl/platform/denormal.cc index 4f071109c32abd..9d65ddc68fda85 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/denormal.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/denormal.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tsl/platform/denormal.h" +#include + #include "tsl/platform/cpu_info.h" #include "tsl/platform/platform.h" diff --git a/third_party/xla/third_party/tsl/tsl/platform/file_system.h b/third_party/xla/third_party/tsl/tsl/platform/file_system.h index 76fab57f4b64b5..8f7bd875e35bc3 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/file_system.h +++ b/third_party/xla/third_party/tsl/tsl/platform/file_system.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -40,6 +41,7 @@ limitations under the License. 
namespace tsl { +class FileAcl; class RandomAccessFile; class ReadOnlyMemoryRegion; class WritableFile; @@ -531,6 +533,13 @@ class FileSystem { return errors::Unimplemented("SetOption"); } + /// \brief Set File System ACL checker. + /// + /// No checks are enforced if a FileAcl is never set. + virtual tsl::Status SetFileAcl(std::shared_ptr file_acl) { + return errors::Unimplemented("SetFileAcl"); + } + FileSystem() {} virtual ~FileSystem() = default; @@ -902,6 +911,13 @@ class FileSystemRegistry { std::vector* schemes) = 0; }; +/// \brief An abstraction for enforcing ACL checks in FileSystem. +class FileAcl { + public: + virtual absl::Status CheckAccess(std::string_view path) = 0; + virtual ~FileAcl() = default; +}; + } // namespace tsl #endif // TENSORFLOW_TSL_PLATFORM_FILE_SYSTEM_H_ diff --git a/third_party/xla/third_party/tsl/tsl/platform/host_info.h b/third_party/xla/third_party/tsl/tsl/platform/host_info.h index 189f3be2934ce3..630f9424525e04 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/host_info.h +++ b/third_party/xla/third_party/tsl/tsl/platform/host_info.h @@ -16,11 +16,26 @@ limitations under the License. #ifndef TENSORFLOW_TSL_PLATFORM_HOST_INFO_H_ #define TENSORFLOW_TSL_PLATFORM_HOST_INFO_H_ +#include + #include "tsl/platform/types.h" namespace tsl { namespace port { +// Statistical data of IO operations performed by the job. +struct IOStatistics { + struct Distribution { + uint64_t count = 0; + double mean = 0.0; + double std_dev = 0.0; + }; + // Distribution of round trip IO latency in microseconds. + Distribution roundtrip_latency_usec; + // Distribution of data received by IO reads in bytes. + Distribution response_bytes; +}; + // Return the hostname of the machine on which this process is running. string Hostname(); @@ -34,6 +49,9 @@ int64_t JobUid(); // Returns the Borg task ID as an int64_t if it exists. Otherwise return -1. int64_t TaskId(); +// Retrieves the host file read statistics. +IOStatistics GetIOStatistics(); + } // namespace port } // namespace tsl diff --git a/third_party/xla/third_party/tsl/tsl/platform/load_library.h b/third_party/xla/third_party/tsl/tsl/platform/load_library.h index e46f85da0a7f9a..5a42f2a3439fd0 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/load_library.h +++ b/third_party/xla/third_party/tsl/tsl/platform/load_library.h @@ -16,16 +16,19 @@ limitations under the License. 
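To show the intended shape of the new FileAcl hook, here is a hypothetical checker; the class name and policy are invented for illustration and only the CheckAccess/SetFileAcl contract comes from this change:

// Sketch only: allow access under a single prefix, deny everything else.
class PrefixOnlyFileAcl : public tsl::FileAcl {
 public:
  explicit PrefixOnlyFileAcl(std::string prefix) : prefix_(std::move(prefix)) {}
  absl::Status CheckAccess(std::string_view path) override {
    if (absl::StartsWith(path, prefix_)) return absl::OkStatus();
    return absl::PermissionDeniedError("path outside allowed prefix");
  }

 private:
  std::string prefix_;
};
// file_system->SetFileAcl(std::make_shared<PrefixOnlyFileAcl>("/allowed/"));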
#ifndef TENSORFLOW_TSL_PLATFORM_LOAD_LIBRARY_H_ #define TENSORFLOW_TSL_PLATFORM_LOAD_LIBRARY_H_ -#include "tsl/platform/status.h" +#include + +#include "absl/status/status.h" namespace tsl { namespace internal { -Status LoadDynamicLibrary(const char* library_filename, void** handle); -Status GetSymbolFromLibrary(void* handle, const char* symbol_name, - void** symbol); -string FormatLibraryFileName(const string& name, const string& version); +absl::Status LoadDynamicLibrary(const char* library_filename, void** handle); +absl::Status GetSymbolFromLibrary(void* handle, const char* symbol_name, + void** symbol); +std::string FormatLibraryFileName(const std::string& name, + const std::string& version); } // namespace internal diff --git a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD index 1c04a4d2d4a1a7..1c5558dbb9faeb 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD +++ b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD @@ -44,10 +44,10 @@ cc_library( srcs = [ "android_armv7a_cpu_utils_helper.h", "cpu_utils.cc", - "i_cpu_utils_helper.h", ], hdrs = [ "cpu_utils.h", + "i_cpu_utils_helper.h", ], copts = tsl_copts(), visibility = ["//visibility:public"], diff --git a/third_party/xla/third_party/tsl/tsl/platform/status_matchers.h b/third_party/xla/third_party/tsl/tsl/platform/status_matchers.h index bddf2529771f1e..ee2144dca8a698 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/status_matchers.h +++ b/third_party/xla/third_party/tsl/tsl/platform/status_matchers.h @@ -98,19 +98,12 @@ limitations under the License. // Status status = OkStatus(); // EXPECT_THAT(status, IsOk()); -namespace tensorflow { -namespace error { -// TODO(ddunleavy) Move this to TSL. This stays here until error_codes proto -// is moved to TSL due to an ADL issue +namespace tsl { + inline void PrintTo(const tsl::error::Code code, std::ostream* os) { *os << Code_Name(code); } -} // namespace error -} // namespace tensorflow - -namespace tsl { - template void PrintTo(const StatusOr& status_or, std::ostream* os) { *os << ::testing::PrintToString(status_or.status()); diff --git a/third_party/xla/third_party/tsl/tsl/platform/tensor_float_32_utils.h b/third_party/xla/third_party/tsl/tsl/platform/tensor_float_32_utils.h index 5d1db659c9f43c..d956340c303309 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/tensor_float_32_utils.h +++ b/third_party/xla/third_party/tsl/tsl/platform/tensor_float_32_utils.h @@ -18,6 +18,8 @@ limitations under the License. namespace tsl { +// NOTE: The usage of this function is only supported through the Tensorflow +// framework. void enable_tensor_float_32_execution(bool enabled); bool tensor_float_32_execution_enabled(); diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD b/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD index 7ff0f110fe6722..2bde9eb95b3b73 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD +++ b/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD @@ -1,10 +1,9 @@ -load("//tsl:tsl.default.bzl", "filegroup") - # Tensorflow windows-specific implementations of tensorflow/core/platform libraries. 
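A hedged usage sketch for the absl::Status-based load_library primitives declared above; the library and symbol names are placeholders:

// Sketch only: formats to e.g. "libcudart.so.12" on Linux; other platforms
// apply their own library naming convention.
std::string filename = tsl::internal::FormatLibraryFileName("cudart", "12");
void* handle = nullptr;
absl::Status status = tsl::internal::LoadDynamicLibrary(filename.c_str(), &handle);
void* symbol = nullptr;
if (status.ok()) {
  status = tsl::internal::GetSymbolFromLibrary(handle, "cudaGetDeviceCount", &symbol);
}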
load( "//tsl:tsl.bzl", "tsl_copts", ) +load("//tsl:tsl.default.bzl", "filegroup") load( "//tsl/platform:rules_cc.bzl", "cc_library", @@ -144,7 +143,7 @@ cc_library( deps = [ ":wide_char", "//tsl/platform:errors", - "//tsl/platform:status", + "@com_google_absl//absl/status", ], ) diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/load_library.cc b/third_party/xla/third_party/tsl/tsl/platform/windows/load_library.cc index 0c47532dc687a7..66d2d62cf6e130 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/windows/load_library.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/windows/load_library.cc @@ -28,7 +28,7 @@ limitations under the License. #include #include -#include "tsl/platform/errors.h" +#include "absl/status/status.h" #include "tsl/platform/windows/wide_char.h" #pragma comment(lib, "Shlwapi.lib") @@ -37,8 +37,8 @@ namespace tsl { namespace internal { -Status LoadDynamicLibrary(const char* library_filename, void** handle) { - string file_name = library_filename; +absl::Status LoadDynamicLibrary(const char* library_filename, void** handle) { + std::string file_name = library_filename; std::replace(file_name.begin(), file_name.end(), '/', '\\'); std::wstring ws_file_name(tsl::Utf8ToWideChar(file_name)); @@ -46,26 +46,27 @@ Status LoadDynamicLibrary(const char* library_filename, void** handle) { HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL, LOAD_WITH_ALTERED_SEARCH_PATH); if (!hModule) { - return tsl::errors::NotFound(file_name + " not found"); + return absl::NotFoundError(file_name + " not found"); } *handle = hModule; - return OkStatus(); + return absl::OkStatus(); } -Status GetSymbolFromLibrary(void* handle, const char* symbol_name, - void** symbol) { +absl::Status GetSymbolFromLibrary(void* handle, const char* symbol_name, + void** symbol) { FARPROC found_symbol; found_symbol = GetProcAddress((HMODULE)handle, symbol_name); if (found_symbol == NULL) { - return tsl::errors::NotFound(std::string(symbol_name) + " not found"); + return absl::NotFoundError(std::string(symbol_name) + " not found"); } *symbol = (void**)found_symbol; - return OkStatus(); + return absl::OkStatus(); } -string FormatLibraryFileName(const string& name, const string& version) { - string filename; +std::string FormatLibraryFileName(const std::string& name, + const std::string& version) { + std::string filename; if (version.size() == 0) { filename = name + ".dll"; } else { diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/port.cc b/third_party/xla/third_party/tsl/tsl/platform/windows/port.cc index 9b5692650dbb5c..f8e19503edb305 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/windows/port.cc +++ b/third_party/xla/third_party/tsl/tsl/platform/windows/port.cc @@ -61,6 +61,8 @@ int64_t JobUid() { return -1; } int64_t TaskId() { return -1; } +IOStatistics GetIOStatistics() { return IOStatistics(); } + int NumSchedulableCPUs() { SYSTEM_INFO system_info; GetSystemInfo(&system_info); @@ -122,7 +124,6 @@ void NUMAFree(void* ptr, size_t size) { tsl::port::Free(ptr); } int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; } - bool Snappy_Compress(const char* input, size_t length, string* output) { #ifdef TF_USE_SNAPPY output->resize(snappy::MaxCompressedLength(length)); @@ -183,7 +184,7 @@ string Demangle(const char* mangled) { return mangled; } double NominalCPUFrequency() { DWORD data; DWORD data_size = sizeof(data); - #pragma comment(lib, "shlwapi.lib") // For SHGetValue(). +#pragma comment(lib, "shlwapi.lib") // For SHGetValue(). 
if (SUCCEEDED( SHGetValueA(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD index 70fe322adcda52..c23d63f5f4eddd 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD +++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD @@ -3,6 +3,10 @@ load("//tsl/platform:build_config_root.bzl", "if_static") load("//tsl:tsl.default.bzl", "filegroup") load("//tsl:tsl.bzl", "if_not_android", "set_external_visibility") load("//tsl/platform:build_config.bzl", "tsl_cc_test") +load( + "//tsl/platform/default:cuda_build_defs.bzl", + "if_cuda_is_configured", +) load( "//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts", @@ -252,8 +256,8 @@ cc_library( "//tsl/platform:macros", "//tsl/platform:types", "@com_google_absl//absl/strings", - ] + if_not_android([ - "//tsl/profiler/backends/cpu:annotation_stack", + ] + if_cuda_is_configured([ + "@local_config_cuda//cuda:cuda_headers", # NVTX headers ]), ) diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h index 416d8293784551..e3eaaa08af79e8 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h +++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h @@ -24,18 +24,17 @@ limitations under the License. #if GOOGLE_CUDA #include "nvtx3/nvToolsExt.h" +#else +// Some typedef to help build without NVTX. +typedef void* nvtxEventAttributes_t; +typedef void* nvtxDomainHandle_t; +typedef void* nvtxStringHandle_t; #endif namespace tsl { namespace profiler { namespace nvtx { -// Some typedef to help build without NVTX. -#if !GOOGLE_CUDA -typedef void* nvtxEventAttributes_t; -typedef void* nvtxDomainHandle_t; -#endif - // A helper function that return the domains to use if NVTX profiling // is enabled. inline std::optional GetNVTXDomain() { @@ -65,15 +64,38 @@ inline bool RangesEnabled() { #endif } -// Note: The memory backing msg must persist until the result of this function -// has been consumed by an NVTX API. -inline void MakeAttributes(const char* msg, nvtxEventAttributes_t* result) { - *result = {0}; +// Two types of NVTX range annotation are supported, the older/simpler option +// is to use std::string and have the NVTX implementation copy a C-style +// string every time. The other option is to pass a struct implementing two +// methods: +// +// std::string_view Title() const; +// nvtxStringHandle_t NvtxRegisteredTitle() const; +// +// in which case NvtxRegisteredTitle() will be used when starting NVTX ranges, +// avoiding this string copy. +// The Title() method is needed because AnnotationStack::PushAnnotation(...) is +// the backend for some annotations when NVTX is not enabled, and it does not +// recognise registered strings. has_annotation_api_v +// distinguishes between the two types of annotation. 
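The registered-string protocol described in the comment above can be satisfied by a small wrapper like the following; this is a sketch, not part of the change, and it assumes the NVTX3 registration call nvtxDomainRegisterStringA:

// Sketch only: an annotation type that RangePush() treats as "registered",
// so the title string is not copied on every push.
class RegisteredAnnotation {
 public:
  RegisteredAnnotation(nvtxDomainHandle_t domain, std::string title)
      : title_(std::move(title)) {
#if GOOGLE_CUDA
    handle_ = nvtxDomainRegisterStringA(domain, title_.c_str());
#endif
  }
  std::string_view Title() const { return title_; }
  nvtxStringHandle_t NvtxRegisteredTitle() const { return handle_; }

 private:
  std::string title_;
  nvtxStringHandle_t handle_{};  // Plain void* when building without NVTX.
};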
+template +inline constexpr bool has_annotation_api_v = + !std::is_same_v; + +template +void RangePush(nvtxDomainHandle_t domain, const AnnotationType& annotation) { #if GOOGLE_CUDA - result->version = NVTX_VERSION; - result->size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - result->messageType = NVTX_MESSAGE_TYPE_ASCII; - result->message.ascii = msg; + nvtxEventAttributes_t attrs{}; + attrs.version = NVTX_VERSION; + attrs.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + if constexpr (has_annotation_api_v>) { + attrs.messageType = NVTX_MESSAGE_TYPE_REGISTERED; + attrs.message.registered = annotation.NvtxRegisteredTitle(); + } else { + attrs.messageType = NVTX_MESSAGE_TYPE_ASCII; + attrs.message.ascii = annotation.c_str(); + } + ::nvtxDomainRangePushEx(domain, &attrs); #endif } diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h index 643d7045428605..f047fafc4ebe3a 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h +++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h @@ -53,10 +53,7 @@ class ScopedAnnotationT { std::optional domain = tsl::profiler::nvtx::GetNVTXDomain(); if (TF_PREDICT_FALSE(domain.has_value())) { - nvtxEventAttributes_t attrs; - std::string name_str(name); - tsl::profiler::nvtx::MakeAttributes(name_str.c_str(), &attrs); - ::nvtxDomainRangePushEx(domain.value(), &attrs); + tsl::profiler::nvtx::RangePush(domain.value(), std::string{name}); } else // NOLINT #endif if (always_annotate || TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { @@ -74,9 +71,7 @@ class ScopedAnnotationT { std::optional domain = tsl::profiler::nvtx::GetNVTXDomain(); if (TF_PREDICT_FALSE(domain.has_value())) { - nvtxEventAttributes_t attrs; - tsl::profiler::nvtx::MakeAttributes(name.c_str(), &attrs); - ::nvtxDomainRangePushEx(domain.value(), &attrs); + tsl::profiler::nvtx::RangePush(domain.value(), name); } else // NOLINT #endif if (always_annotate || TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { @@ -91,9 +86,7 @@ class ScopedAnnotationT { std::optional domain = tsl::profiler::nvtx::GetNVTXDomain(); if (TF_PREDICT_FALSE(domain.has_value())) { - nvtxEventAttributes_t attrs; - tsl::profiler::nvtx::MakeAttributes(name.c_str(), &attrs); - ::nvtxDomainRangePushEx(domain.value(), &attrs); + tsl::profiler::nvtx::RangePush(domain.value(), name); } else // NOLINT #endif if (always_annotate || TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { @@ -109,15 +102,17 @@ class ScopedAnnotationT { std::optional domain = tsl::profiler::nvtx::GetNVTXDomain(); if (TF_PREDICT_FALSE(domain.has_value())) { - auto name = name_generator(); - nvtxEventAttributes_t attrs; - tsl::profiler::nvtx::MakeAttributes(name.c_str(), &attrs); - ::nvtxDomainRangePushEx(domain.value(), &attrs); + tsl::profiler::nvtx::RangePush(domain.value(), name_generator()); } else // NOLINT #endif if (always_annotate || TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { - auto name = name_generator(); - old_length_ = AnnotationStack::PushAnnotation(name); + auto annotation = name_generator(); + if constexpr (tsl::profiler::nvtx::has_annotation_api_v< + std::decay_t>) { + old_length_ = AnnotationStack::PushAnnotation(annotation.Title()); + } else { + old_length_ = AnnotationStack::PushAnnotation(std::move(annotation)); + } } #endif } diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation_stack.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation_stack.h index 
f4e538f127c9bb..db46f7c99135e4 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation_stack.h +++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation_stack.h @@ -55,10 +55,7 @@ class ScopedAnnotationStack { std::optional domain = tsl::profiler::nvtx::GetNVTXDomain(); if (TF_PREDICT_FALSE(domain.has_value())) { - nvtxEventAttributes_t attrs; - std::string name_str(name); - tsl::profiler::nvtx::MakeAttributes(name_str.c_str(), &attrs); - ::nvtxDomainRangePushEx(domain.value(), &attrs); + tsl::profiler::nvtx::RangePush(domain.value(), name); } else // NOLINT #endif if (TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { @@ -83,15 +80,17 @@ class ScopedAnnotationStack { std::optional domain = tsl::profiler::nvtx::GetNVTXDomain(); if (TF_PREDICT_FALSE(domain.has_value())) { - auto name = name_generator(); - nvtxEventAttributes_t attrs; - std::string name_str(name); - tsl::profiler::nvtx::MakeAttributes(name_str.c_str(), &attrs); - ::nvtxDomainRangePushEx(domain.value(), &attrs); + tsl::profiler::nvtx::RangePush(domain.value(), name_generator()); } else // NOLINT #endif if (TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { - return AnnotationStack::PushAnnotation(name_generator()); + auto annotation = name_generator(); + if constexpr (tsl::profiler::nvtx::has_annotation_api_v< + std::decay_t>) { + return AnnotationStack::PushAnnotation(annotation.Title()); + } else { + return AnnotationStack::PushAnnotation(std::move(annotation)); + } } #endif return kInvalidActivity; diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/tf_op_utils.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/tf_op_utils.cc index 7dadfae46f7913..4129e2ae8fa7c7 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/utils/tf_op_utils.cc +++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/tf_op_utils.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tsl/profiler/utils/tf_op_utils.h" +#include +#include #include #include @@ -61,6 +63,32 @@ absl::string_view DeriveOpType(absl::string_view full_op_name) { return op_type; } +// TODO(xprof-devs): Include the corresponding Ops on TPU. +std::optional GetMemcpyOp(absl::string_view tf_op_fullname) { + TfOp tf_op; + tf_op.name = tf_op_fullname; + if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYHToD")) { + tf_op.category = Category::kMemcpyHToD; + tf_op.type = kMemcpyHToDOp; + return tf_op; + } + if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYDToH")) { + tf_op.category = Category::kMemcpyDToH; + tf_op.type = kMemcpyDToHOp; + return tf_op; + } + if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYDToD")) { + tf_op.category = Category::kMemcpyDToD; + tf_op.type = kMemcpyDToDOp; + return tf_op; + } else if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYHToH")) { + tf_op.category = Category::kMemcpyHToH; + tf_op.type = kMemcpyHToHOp; + return tf_op; + } + return std::nullopt; +} + } // namespace const absl::string_view kUnknownOp = ""; // op types are non-empty strings @@ -70,12 +98,14 @@ const absl::string_view kMemcpyDToHOp = "MemcpyDToH"; const absl::string_view kMemcpyDToDOp = "MemcpyDToD"; const absl::string_view kMemcpyHToHOp = "MemcpyHToH"; +// Example inputs: "MyOpName", "MyNamespace>MyOpName" bool IsTfOpName(absl::string_view op_name) { // TODO(b/177602927): Confirm the naming convention with the TF team. 
static const LazyRE2 kTfOpNameRegEx = {"[A-Za-z0-9.][A-Za-z0-9_.\\/>-]*"}; return RE2::FullMatch(op_name, *kTfOpNameRegEx); } +// Example inputs: "MyType", "_MyInternalType" bool IsTfOpType(absl::string_view op_type) { static const LazyRE2 kTfOpTypeRegEx = {"[A-Z_][a-zA-Z0-9_]*"}; return RE2::FullMatch(op_type, *kTfOpTypeRegEx); @@ -97,52 +127,64 @@ bool IsJaxOpNameAndType(absl::string_view op_name, absl::string_view op_type) { } TfOp ParseTfOpFullname(absl::string_view tf_op_fullname) { - // TF Op names have the format "name:type". + // For op types below, they all have the format ":", though + // op_type could be empty. TfOp tf_op = {Category::kUnknown, tf_op_fullname, kUnknownOp}; std::vector parts = absl::StrSplit(tf_op_fullname, absl::MaxSplits(':', 1)); + if (parts.size() != 2) { - // GPU-related Ops that need to be tracked. - if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYHToD")) { - tf_op.category = Category::kMemcpyHToD; - tf_op.type = kMemcpyHToDOp; - } else if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYDToH")) { - tf_op.category = Category::kMemcpyDToH; - tf_op.type = kMemcpyDToHOp; - } else if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYDToD")) { - tf_op.category = Category::kMemcpyDToD; - tf_op.type = kMemcpyDToDOp; - } else if (absl::StartsWithIgnoreCase(tf_op_fullname, "MEMCPYHToH")) { - tf_op.category = Category::kMemcpyHToH; - tf_op.type = kMemcpyHToHOp; + // Two possibilities here: GPU memcpy op or invalid op. + if (std::optional tfop = GetMemcpyOp(parts[0]); tfop.has_value()) { + return *tfop; } - // TODO(ckluk): Include the corresponding Ops on TPU. - } else if (parts[0] == kIterator) { + return tf_op; + } + + // Check for a Dataset op. + if (parts[0] == kIterator) { // Dataset Op names (e.g., Iterator::Batch::Map::TFRecord) do not follow the // format of TF Op names. But we still want to capture them for // input-pipeline analysis. tf_op.category = Category::kTfData; tf_op.type = kDatasetOp; - } else if (IsTfOpType(parts[1]) && IsTfOpName(parts[0])) { - tf_op = {Category::kTensorFlow, parts[0], parts[1]}; - } else { - absl::string_view op_type = - parts[1].empty() ? DeriveOpType(parts[0]) : parts[1]; - if (IsJaxOpType(op_type)) { - // JAX category introduces op_type with '[]' including unnecessary details - // to represent a group of ops. - // We need to striping the brackets and contents inside. Based on our - // analysis, all the op_type ends with a closing ']' if it contains - // brakets. It's safe to remove all the characters starting with the - // position of '['. - // Example: - // "transpose[permutation=(0, 3, 1, 2)]" => "transpose" - // See: go/xprof-jax-op-type - tf_op = {Category::kJax, parts[0], op_type.substr(0, op_type.find('['))}; - } else if (parts[1].empty()) { - tf_op = {Category::kTensorFlow, parts[0], op_type}; - } + return tf_op; + } + + // Check for Tensorflow Op. + if (IsTfOpName(parts[0]) && IsTfOpType(parts[1])) { + tf_op.category = Category::kTensorFlow; + tf_op.name = parts[0]; + tf_op.type = parts[1]; + return tf_op; + } + + // Check for JAX op. + absl::string_view op_type = + parts[1].empty() ? DeriveOpType(parts[0]) : parts[1]; + if (IsJaxOpType(op_type)) { + // JAX category introduces op_type with '[]' including unnecessary details + // to represent a group of ops. + // We need to striping the brackets and contents inside. Based on our + // analysis, all the op_type ends with a closing ']' if it contains + // brakets. It's safe to remove all the characters starting with the + // position of '['. 
+ // Example: + // "transpose[permutation=(0, 3, 1, 2)]" => "transpose" + // See: go/xprof-jax-op-type + tf_op.category = Category::kJax; + tf_op.name = parts[0]; + tf_op.type = op_type.substr(0, op_type.find('[')); + return tf_op; + } + + if (parts[1].empty()) { + tf_op.category = Category::kTensorFlow; + tf_op.name = parts[0]; + tf_op.type = op_type; + return tf_op; } + return tf_op; } diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/trace_utils.h b/third_party/xla/third_party/tsl/tsl/profiler/utils/trace_utils.h index 90cee796fd95a7..6a7093b422c7d1 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/utils/trace_utils.h +++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/trace_utils.h @@ -29,10 +29,15 @@ namespace profiler { // Support up to 500 accelerator devices. constexpr uint32 kFirstDeviceId = 1; constexpr uint32 kLastDeviceId = 500; -// Support Upto 200 custom planes. -constexpr uint32 kCustomPlaneDeviceId = kLastDeviceId + 1; +// Support Upto 200 custom planes as fake devices (i.e., planes with a +// "/custom:" prefix). See `::kCustomPlanePrefix` for more +// information +constexpr uint32 kFirstCustomPlaneDeviceId = kLastDeviceId + 1; +constexpr uint32 kMaxCustomPlaneDevicesPerHost = 200; +constexpr uint32 kLastCustomPlaneDeviceId = + kFirstCustomPlaneDeviceId + kMaxCustomPlaneDevicesPerHost - 1; // Host threads are shown as a single fake device. -constexpr uint32 kHostThreadsDeviceId = kCustomPlaneDeviceId + 200; +constexpr uint32 kHostThreadsDeviceId = kLastCustomPlaneDeviceId + 1; // Constants used as trace_viewer TID (resource_id in trace_events.proto). constexpr int kThreadIdDerivedMin = 0xdeadbeef; diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc index 2f7eb630aa324a..62b69f2910b334 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc +++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.cc @@ -272,6 +272,7 @@ const StatTypeMap& GetStatTypeMap() { {"model_version", kModelVersion}, {"bytes_transferred", kBytesTransferred}, {"queue", kDmaQueue}, + {"dcn_collective_info", kDcnCollectiveInfo}, // Performance counter related. {"Raw Value", kRawValue}, {"Scaled Value", kScaledValue}, diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h index 8fa320791f0ee5..7bbd052f815eb9 100644 --- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h +++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_schema.h @@ -259,6 +259,7 @@ enum StatType { kModelVersion, kBytesTransferred, kDmaQueue, + kDcnCollectiveInfo, // Performance counter related. 
kRawValue, kScaledValue, diff --git a/third_party/xla/third_party/tsl/tsl/tsl.bzl b/third_party/xla/third_party/tsl/tsl/tsl.bzl index e1c2b364fbca78..caad0d8eeb5a20 100644 --- a/third_party/xla/third_party/tsl/tsl/tsl.bzl +++ b/third_party/xla/third_party/tsl/tsl/tsl.bzl @@ -37,6 +37,11 @@ load( "if_tensorrt", ) +# buildifier: disable=out-of-order-load +# Internally this loads a macro, but in OSS this is a function +def register_extension_info(**kwargs): + pass + two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"] def clean_dep(target): @@ -349,6 +354,8 @@ def tsl_gpu_library(deps = None, cuda_deps = None, copts = tsl_copts(), **kwargs **kwargs ) +register_extension_info(extension = tsl_gpu_library, label_regex_for_dep = "{extension_name}") + # Traverse the dependency graph along the "deps" attribute of the # target and return a struct with one field called 'tf_collected_deps'. # tf_collected_deps will be the union of the deps of the current target @@ -562,6 +569,7 @@ def tsl_pybind_extension_opensource( data = [], defines = [], deprecation = None, + enable_stub_generation = False, # @unused features = [], licenses = None, linkopts = [], @@ -754,9 +762,6 @@ def tsl_pybind_extension_opensource( compatible_with = compatible_with, ) -# Export open source version of pybind_extension under base name as well. -tsl_pybind_extension = tsl_pybind_extension_opensource - # Used for specifying external visibility constraints. In non-monorepo situations, this needs to be # public, but monorepos can have more precise constraints. def set_external_visibility(monorepo_paths): diff --git a/third_party/xla/third_party/tsl/tsl/tsl.default.bzl b/third_party/xla/third_party/tsl/tsl/tsl.default.bzl index c6bb4f3526e9b9..1759e5106320d5 100644 --- a/third_party/xla/third_party/tsl/tsl/tsl.default.bzl +++ b/third_party/xla/third_party/tsl/tsl/tsl.default.bzl @@ -7,7 +7,7 @@ load( _if_not_mobile_or_arm_or_lgpl_restricted = "if_not_mobile_or_arm_or_lgpl_restricted", _internal_hlo_deps = "internal_hlo_deps", _tsl_grpc_cc_dependencies = "tsl_grpc_cc_dependencies", - _tsl_pybind_extension = "tsl_pybind_extension", + _tsl_pybind_extension = "tsl_pybind_extension_opensource", ) get_compatible_with_portable = _get_compatible_with_portable diff --git a/third_party/xla/third_party/tsl/tsl/util/BUILD b/third_party/xla/third_party/tsl/tsl/util/BUILD index a913dbb77ac724..09b864ac264d20 100644 --- a/third_party/xla/third_party/tsl/tsl/util/BUILD +++ b/third_party/xla/third_party/tsl/tsl/util/BUILD @@ -286,6 +286,7 @@ cc_library( "//tsl/platform:stringpiece", "//tsl/platform:stringprintf", "//tsl/platform:types", + "@com_google_absl//absl/strings", ], ) diff --git a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc b/third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc index 520962fe410262..5e316e9ae9fc6a 100644 --- a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc +++ b/third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc @@ -15,11 +15,13 @@ limitations under the License. 
#include "tsl/util/command_line_flags.h" +#include #include #include #include #include +#include "absl/strings/match.h" #include "tsl/platform/logging.h" #include "tsl/platform/str_util.h" #include "tsl/platform/stringpiece.h" @@ -96,10 +98,10 @@ bool ParseBoolFlag(StringPiece arg, StringPiece flag, if (!absl::ConsumePrefix(&arg, "=")) { return false; } - if (absl::EqualsIgnoreCase(arg, "true")) { + if (absl::EqualsIgnoreCase(arg, "true") || arg == "1") { *value_parsing_ok = hook(true); return true; - } else if (absl::EqualsIgnoreCase(arg, "false")) { + } else if (absl::EqualsIgnoreCase(arg, "false") || arg == "0") { *value_parsing_ok = hook(false); return true; } else { @@ -290,6 +292,29 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const { return result && (*argc < 2 || strcmp(argv[1], "--help") != 0); } +/*static*/ bool Flags::Parse(std::vector& flags, + const std::vector& flag_list) { + bool result = true; + std::vector unknown_flags; + for (auto& flag : flags) { + for (const Flag& flag_object : flag_list) { + bool value_parsing_ok; + bool was_found = flag_object.Parse(flag, &value_parsing_ok); + if (!value_parsing_ok) { + result = false; + } + // Clear parsed flags, these empty entries are removed later. + if (was_found) { + flag.clear(); + break; + } + } + } + auto IsEmpty = [](const std::string& flag) { return flag.empty(); }; + flags.erase(std::remove_if(flags.begin(), flags.end(), IsEmpty), flags.end()); + return result; +} + /*static*/ string Flags::Usage(const string& cmdline, const std::vector& flag_list) { string usage_text; diff --git a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.h b/third_party/xla/third_party/tsl/tsl/util/command_line_flags.h index 6553bc887c853e..2710de5753cd01 100644 --- a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.h +++ b/third_party/xla/third_party/tsl/tsl/util/command_line_flags.h @@ -132,6 +132,11 @@ class Flags { // first remaining argument is not "--help". static bool Parse(int* argc, char** argv, const std::vector& flag_list); + // Similar as above, but accepts a mutable vector of strings in place of + // argc and argv. Doesn't ignore the first flag, and return the unknown flags + // back in flags vector. + static bool Parse(std::vector& flags, + const std::vector& flag_list); // Return a usage message with command line cmdline, and the // usage_text strings in flag_list[]. 
static string Usage(const string& cmdline, diff --git a/third_party/xla/tools/toolchains/cross_compile/cc/BUILD b/third_party/xla/tools/toolchains/cross_compile/cc/BUILD new file mode 100644 index 00000000000000..dc621893ac9675 --- /dev/null +++ b/third_party/xla/tools/toolchains/cross_compile/cc/BUILD @@ -0,0 +1,191 @@ +"""Toolchain configs for cross-compiling TensorFlow""" + +load("@bazel_tools//tools/cpp:unix_cc_toolchain_config.bzl", "cc_toolchain_config") + +package(default_visibility = ["//visibility:public"]) + +licenses(["restricted"]) + +cc_toolchain_suite( + name = "cross_compile_toolchain_suite", + toolchains = { + "aarch64": ":linux_aarch64_toolchain", + "k8": ":linux_x86_toolchain", + }, +) + +filegroup( + name = "empty", + visibility = ["//visibility:public"], +) + +cc_toolchain( + name = "linux_x86_toolchain", + all_files = ":empty", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = ":empty", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 1, + toolchain_config = ":linux_x86_toolchain_config", + toolchain_identifier = "linux_x86_toolchain", +) + +cc_toolchain_config( + name = "linux_x86_toolchain_config", + abi_libc_version = "local", + abi_version = "local", + builtin_sysroot = "/dt9", + compile_flags = [ + "--target=x86_64-unknown-linux-gnu", + "-fstack-protector", + "-Wall", + "-Wthread-safety", + "-Wself-assign", + "-Wunused-but-set-parameter", + "-Wno-free-nonheap-object", + "-fcolor-diagnostics", + "-fno-omit-frame-pointer", + "-mavx", + ], + compiler = "clang", + coverage_compile_flags = ["--coverage"], + coverage_link_flags = ["--coverage"], + cpu = "k8", + cxx_builtin_include_directories = [ + "/dt9/", + "/usr/lib/llvm-17/include/", + "/usr/lib/llvm-17/lib/clang/17/include", + ], + dbg_compile_flags = ["-g"], + host_system_name = "linux", + link_flags = [ + "--target=x86_64-unknown-linux-gnu", + "-fuse-ld=lld", + "--ld-path=/usr/lib/llvm-17/bin/ld.lld", + "-Wl,--undefined-version", + ], + link_libs = [ + "-lstdc++", + "-lm", + ], + opt_compile_flags = [ + "-g0", + "-O2", + "-D_FORTIFY_SOURCE=1", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + opt_link_flags = ["-Wl,--gc-sections"], + supports_start_end_lib = True, + target_libc = "", + target_system_name = "x86_64-unknown-linux-gnu", + tool_paths = { + "gcc": "/usr/lib/llvm-17/bin/clang", + "ld": "/usr/lib/llvm-17/bin/ld.lld", + "ar": "/usr/lib/llvm-17/bin/llvm-ar", + "cpp": "/usr/lib/llvm-17/bin/clang++", + "llvm-cov": "/usr/lib/llvm-17/bin/llvm-cov", + "nm": "/usr/lib/llvm-17/bin/llvm-nm", + "objdump": "/usr/lib/llvm-17/bin/llvm-objdump", + "strip": "/usr/lib/llvm-17/bin/llvm-strip", + }, + toolchain_identifier = "linux_x86_toolchain", + unfiltered_compile_flags = [ + "-no-canonical-prefixes", + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + "-Wno-unused-command-line-argument", + "-Wno-gnu-offsetof-extensions", + ], +) + +cc_toolchain( + name = "linux_aarch64_toolchain", + all_files = ":empty", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = ":empty", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 1, + toolchain_config = ":linux_aarch64_toolchain_config", + toolchain_identifier = "linux_aarch64_toolchain", +) + +cc_toolchain_config( + name = "linux_aarch64_toolchain_config", + abi_libc_version = "local", + abi_version = "local", + builtin_sysroot = "/dt10/", + compile_flags = [ + 
"--target=aarch64-unknown-linux-gnu", + "-fstack-protector", + "-Wall", + "-Wthread-safety", + "-Wself-assign", + "-Wunused-but-set-parameter", + "-Wno-free-nonheap-object", + "-fcolor-diagnostics", + "-fno-omit-frame-pointer", + "-mtune=generic", + "-march=armv8-a", + ], + compiler = "clang", + coverage_compile_flags = ["--coverage"], + coverage_link_flags = ["--coverage"], + cpu = "aarch64", + cxx_builtin_include_directories = [ + "/dt10/", + "/usr/lib/llvm-17/include/", + "/usr/lib/llvm-17/lib/clang/17/include", + ], + dbg_compile_flags = ["-g"], + host_system_name = "linux", + link_flags = [ + "--target=aarch64-unknown-linux-gnu", + "-fuse-ld=lld", + "--ld-path=/usr/lib/llvm-17/bin/ld.lld", + "-Wl,--undefined-version", + ], + link_libs = [ + "-lstdc++", + "-lm", + ], + opt_compile_flags = [ + "-g0", + "-O2", + "-D_FORTIFY_SOURCE=1", + "-DNDEBUG", + "-ffunction-sections", + "-fdata-sections", + ], + opt_link_flags = ["-Wl,--gc-sections"], + supports_start_end_lib = True, + target_libc = "", + target_system_name = "aarch64-unknown-linux-gnu", + tool_paths = { + "gcc": "/usr/lib/llvm-17/bin/clang", + "ld": "/usr/lib/llvm-17/bin/ld.lld", + "ar": "/usr/lib/llvm-17/bin/llvm-ar", + "cpp": "/usr/lib/llvm-17/bin/clang++", + "llvm-cov": "/usr/lib/llvm-17/bin/llvm-cov", + "nm": "/usr/lib/llvm-17/bin/llvm-nm", + "objdump": "/usr/lib/llvm-17/bin/llvm-objdump", + "strip": "/usr/lib/llvm-17/bin/llvm-strip", + }, + toolchain_identifier = "linux_aarch64_toolchain", + unfiltered_compile_flags = [ + "-no-canonical-prefixes", + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + "-Wno-unused-command-line-argument", + "-Wno-gnu-offsetof-extensions", + ], +) diff --git a/third_party/xla/tools/toolchains/cross_compile/config/BUILD b/third_party/xla/tools/toolchains/cross_compile/config/BUILD new file mode 100644 index 00000000000000..b6a504ba1449d6 --- /dev/null +++ b/third_party/xla/tools/toolchains/cross_compile/config/BUILD @@ -0,0 +1,23 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["restricted"]) + +platform( + name = "linux_x86_64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], + exec_properties = { + "container-image": "docker://gcr.io/tensorflow-testing/ml-devinfra-linux-aarch64-cross-compile@sha256:11c5ac3b9b4e01cfa82b39b90826a9bfc5b806ccc92cd3d272e6bf861de43be1", + "OSFamily": "Linux", + }, +) + +platform( + name = "linux_aarch64", + constraint_values = [ + "@platforms//os:linux", + "@platforms//cpu:aarch64", + ], +) diff --git a/third_party/xla/tools/toolchains/remote_config/configs.bzl b/third_party/xla/tools/toolchains/remote_config/configs.bzl index 4554463cb90675..4b07fb5c18670d 100644 --- a/third_party/xla/tools/toolchains/remote_config/configs.bzl +++ b/third_party/xla/tools/toolchains/remote_config/configs.bzl @@ -200,6 +200,28 @@ def initialize_rbe_configs(): python_install_path = "/usr/local", ) + tensorflow_rbe_config( + name = "ubuntu20.04-clang_manylinux2014-cuda12.3-cudnn8.9", + compiler = "/usr/lib/llvm-17/bin/clang", + cuda_version = "12.3", + cudnn_version = "8.9", + os = "ubuntu20.04-manylinux2014-multipython", + python_versions = ["3.9", "3.10", "3.11", "3.12"], + sysroot = "/dt9", + python_install_path = "/usr/local", + ) + + tensorflow_rbe_config( + name = "ubuntu20.04-gcc9_manylinux2014-cuda12.3-cudnn8.9", + compiler = "/dt9/usr/bin/gcc", + compiler_prefix = "/usr/bin", + cuda_version = "12.3", + cudnn_version = "8.9", + os = 
"ubuntu20.04-manylinux2014-multipython", + python_versions = ["3.9", "3.10", "3.11", "3.12"], + python_install_path = "/usr/local", + ) + tensorflow_rbe_win_config( name = "windows_py37", python_bin_path = "C:/Python37/python.exe", diff --git a/third_party/xla/tools/toolchains/remote_config/containers.bzl b/third_party/xla/tools/toolchains/remote_config/containers.bzl index bfb4634e810328..cd346c2816def1 100644 --- a/third_party/xla/tools/toolchains/remote_config/containers.bzl +++ b/third_party/xla/tools/toolchains/remote_config/containers.bzl @@ -5,8 +5,9 @@ container_digests = { # TF now uses only this container "cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython": "sha256:48612bd85709cd014711d0b0f87e0806f3567d06d2e81c6e860516b87498b821", # JAX manylinux2014 configs. - "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:ab39410baf2fc1d31d50540acec7640d7f4814fa694e2421b696b6f0a058d645", - "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:b699d6ae235ac601dc3e62391ac7c4606cb10331f8141983858c1580f5e74ddb", + "cuda11.8-cudnn8.6-ubuntu20.04-manylinux2014-multipython": "sha256:b112c0c77d4172fc025420938f13ea83f3ad480c01778e743a201e5e3f4710e1", + "cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:9fefda035b4a12b24cd5bae56c7dbb9527a5fd06a41ced0a22ac86fe5ed26428", + "cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython": "sha256:afe68c3448734cb07b16005fd9ed47d19533eb8bf5acd92863735ce24766b93b", # ROCM, probably not all of them still in use "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:6e953a09b145df338bcb03e9e36f99b291140c29b72d0a048fb6c5905ccad5eb", "rocm-ubuntu20.04-manylinux2014-multipython": "sha256:906faec7765fe5dd067f2b092b5d5f220c1fedde725fb42c83d031b4d6f32204", @@ -98,6 +99,13 @@ containers = { "digest": container_digests["cuda12.2-cudnn8.9-ubuntu20.04-manylinux2014-multipython"], }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython. + "cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython": { + "registry": "gcr.io", + "repository": "tensorflow-testing/nosla-cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython", + "digest": container_digests["cuda12.3-cudnn8.9-ubuntu20.04-manylinux2014-multipython"], + }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython. "rocm-ubuntu18.04-manylinux2010-multipython": { "registry": "gcr.io", diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl index 31cb19540a7020..2221985b7bd3c1 100644 --- a/third_party/xla/workspace2.bzl +++ b/third_party/xla/workspace2.bzl @@ -9,12 +9,14 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") # Import third party repository rules. See go/tfbr-thirdparty. load("//third_party/dlpack:workspace.bzl", dlpack = "repo") +load("//third_party/gloo:workspace.bzl", gloo = "repo") load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo") load("//third_party/triton:workspace.bzl", triton = "repo") def _initialize_third_party(): """ Load third party repositories. See above load() statements. 
""" dlpack() + gloo() stablehlo() triton() @@ -37,6 +39,14 @@ def _tf_repositories(): urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v0.9.zip"), ) + tf_http_archive( + name = "cutlass_archive", + build_file = "//third_party:cutlass.BUILD", + sha256 = "ea1b7f96919460a5d80b09c1b246652539a8605600b2be4cccc02c254bccbe50", + strip_prefix = "cutlass-5783d6dbd0c34032371cce2bd999fc76007520d7", + urls = tf_mirror_urls("https://github.com/chsigg/cutlass/archive/5783d6dbd0c34032371cce2bd999fc76007520d7.tar.gz"), + ) + tf_http_archive( name = "boringssl", sha256 = "9dc53f851107eaf87b391136d13b815df97ec8f76dadb487b58b2fc45e624d2c", diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD index 24100c8dbcdc5d..caf77930363679 100644 --- a/third_party/xla/xla/BUILD +++ b/third_party/xla/xla/BUILD @@ -470,6 +470,7 @@ xla_cc_test( ":shape_util", ":test", ":xla_data_proto_cc", + "//xla:status", "@com_google_absl//absl/hash:hash_testing", "@local_tsl//tsl/platform:test_benchmark", "@local_tsl//tsl/platform:test_main", diff --git a/third_party/xla/xla/backends/interpreter/compiler.cc b/third_party/xla/xla/backends/interpreter/compiler.cc index 864d98a8269082..3b89c3b6054de1 100644 --- a/third_party/xla/xla/backends/interpreter/compiler.cc +++ b/third_party/xla/xla/backends/interpreter/compiler.cc @@ -49,7 +49,7 @@ limitations under the License. #include "xla/status_macros.h" #include "xla/statusor.h" #include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream_executor_pimpl.h" +#include "xla/stream_executor/stream_executor.h" #include "xla/util.h" #include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" diff --git a/third_party/xla/xla/backends/interpreter/executor.cc b/third_party/xla/xla/backends/interpreter/executor.cc index 3766f7cb7af82c..1095c71a86b226 100644 --- a/third_party/xla/xla/backends/interpreter/executor.cc +++ b/third_party/xla/xla/backends/interpreter/executor.cc @@ -34,12 +34,6 @@ DeviceMemoryBase XlaInterpreterExecutor::Allocate(uint64_t size, return DeviceMemoryBase(new char[size], size); } -void *XlaInterpreterExecutor::GetSubBuffer(DeviceMemoryBase *parent, - uint64_t offset_bytes, - uint64_t /*size_bytes*/) { - return parent + offset_bytes; -} - void XlaInterpreterExecutor::Deallocate(DeviceMemoryBase *mem) { delete[] static_cast(mem->opaque()); } diff --git a/third_party/xla/xla/backends/interpreter/executor.h b/third_party/xla/xla/backends/interpreter/executor.h index 358b609f23020d..5d866462950072 100644 --- a/third_party/xla/xla/backends/interpreter/executor.h +++ b/third_party/xla/xla/backends/interpreter/executor.h @@ -48,22 +48,22 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { XlaInterpreterExecutor() = default; tsl::Status Init(int device_ordinal, DeviceOptions device_options) override { + device_ordinal_ = device_ordinal; return ::tsl::OkStatus(); } + int device_ordinal() const override { return device_ordinal_; }; tsl::Status GetKernel(const MultiKernelLoaderSpec &spec, - KernelBase *kernel) override { + Kernel *kernel) override { return tsl::errors::Unimplemented("Not Implemented"); } tsl::Status Launch(Stream *stream, const ThreadDim &thread_dims, - const BlockDim &block_dims, const KernelBase &kernel, - const KernelArgsArrayBase &args) override { + const BlockDim &block_dims, const Kernel &kernel, + const KernelArgs &args) override { return tsl::errors::Unimplemented("Not Implemented"); } DeviceMemoryBase Allocate(uint64_t size, int64_t memory_space) override; - void 
*GetSubBuffer(DeviceMemoryBase *parent, uint64_t offset_bytes, - uint64_t size_bytes) override; void Deallocate(DeviceMemoryBase *mem) override; void *HostMemoryAllocate(uint64_t size) override { return new char[size]; } @@ -182,6 +182,10 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { } private: + // The device ordinal value that this executor was initialized with; recorded + // for use in getting device metadata. Immutable post-initialization. + int device_ordinal_; + DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape); tsl::StatusOr AllocateOutputBuffer(const xla::Shape &shape); diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD index ca9d8a952a26b9..50f033f79e6b3a 100644 --- a/third_party/xla/xla/client/BUILD +++ b/third_party/xla/xla/client/BUILD @@ -105,14 +105,19 @@ cc_library( "//xla:debug_options_flags", "//xla:execution_options_util", "//xla:shape_util", + "//xla:statusor", + "//xla:util", "//xla:xla_proto_cc", "//xla/pjrt:compile_options_proto_cc", "//xla/service:compilation_environments", "//xla/service:computation_placer", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings:str_format", "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:status", "@local_tsl//tsl/platform:statusor", ], ) @@ -301,13 +306,17 @@ xla_cc_test( ":xla_computation", "//xla:debug_options_flags", "//xla:shape_util", + "//xla:statusor", + "//xla:test", "//xla:test_helpers", "//xla:util", "//xla:xla_data_proto_cc", "//xla/hlo/ir:hlo", + "//xla/service:hlo_parser", "//xla/service:pattern_matcher", "//xla/service:pattern_matcher_gmock", "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/types:span", "@com_google_googletest//:gtest", "@local_tsl//tsl/platform:statusor", ], diff --git a/third_party/xla/xla/client/executable_build_options.cc b/third_party/xla/xla/client/executable_build_options.cc index 3089a9820a1810..8227de75f19114 100644 --- a/third_party/xla/xla/client/executable_build_options.cc +++ b/third_party/xla/xla/client/executable_build_options.cc @@ -15,16 +15,26 @@ limitations under the License. 
#include "xla/client/executable_build_options.h" +#include #include #include #include #include +#include "absl/log/check.h" #include "absl/strings/str_format.h" #include "xla/debug_options_flags.h" #include "xla/execution_options_util.h" +#include "xla/layout_util.h" +#include "xla/service/compilation_environments.h" +#include "xla/service/computation_placer.h" +#include "xla/shape.h" #include "xla/shape_util.h" +#include "xla/statusor.h" +#include "xla/util.h" #include "xla/xla.pb.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/status.h" #include "tsl/platform/statusor.h" namespace xla { @@ -151,6 +161,10 @@ StatusOr ExecutableBuildOptions::ToProto() const { "Cannot serialize " "ExecutableBuildOptions::layout_canonicalization_callback"); } + if (compile_thread_pool() != nullptr) { + return InvalidArgument( + "Cannot serialize ExecutableBuildOptions::compile_thread_pool"); + } output.set_num_replicas(num_replicas()); output.set_num_partitions(num_partitions()); output.set_use_spmd_partitioning(use_spmd_partitioning()); @@ -170,6 +184,12 @@ StatusOr ExecutableBuildOptions::ToProto() const { } *output.mutable_fdo_profile() = fdo_profile(); output.set_device_memory_size(device_memory_size()); + for (int64_t s : auto_spmd_partitioning_mesh_shape()) { + output.mutable_auto_spmd_partitioning_mesh_shape()->Add(s); + } + for (int64_t s : auto_spmd_partitioning_mesh_ids()) { + output.mutable_auto_spmd_partitioning_mesh_ids()->Add(s); + } return output; } @@ -208,6 +228,12 @@ StatusOr ExecutableBuildOptionsFromProto( input.allow_spmd_sharding_propagation_to_output()); *output.mutable_fdo_profile() = input.fdo_profile(); output.set_device_memory_size(input.device_memory_size()); + output.set_auto_spmd_partitioning_mesh_shape( + std::vector(input.auto_spmd_partitioning_mesh_shape().begin(), + input.auto_spmd_partitioning_mesh_shape().end())); + output.set_auto_spmd_partitioning_mesh_ids( + std::vector(input.auto_spmd_partitioning_mesh_ids().begin(), + input.auto_spmd_partitioning_mesh_ids().end())); return output; } diff --git a/third_party/xla/xla/client/xla_builder.cc b/third_party/xla/xla/client/xla_builder.cc index 172ded25f552af..40666f7a0b6a69 100644 --- a/third_party/xla/xla/client/xla_builder.cc +++ b/third_party/xla/xla/client/xla_builder.cc @@ -937,9 +937,15 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, reshaped_dynamic_dimensions); // Eliminate the size one dimensions. - TF_ASSIGN_OR_RETURN( - XlaOp reshaped_operand, - ReshapeInternal(reshaped_shape, operand, /*inferred_dimension=*/-1)); + // The added reshape reduces the rank of the tensor. Hence we cannot directly + // apply the broadcast's sharding on reshape. + XlaOp reshaped_operand; + { + XlaScopedShardingAssignment scoped_sharding(this, std::nullopt); + TF_ASSIGN_OR_RETURN( + reshaped_operand, + ReshapeInternal(reshaped_shape, operand, /*inferred_dimension=*/-1)); + } // Broadcast 'reshape' up to the larger size. 
return InDimBroadcast(broadcast_shape, reshaped_operand, broadcast_dimensions); @@ -1002,15 +1008,18 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, TF_ASSIGN_OR_RETURN(const Shape* updated_lhs_shape, GetShapePtr(updated_lhs)); - if (!ShapeUtil::SameDimensions(shape, *updated_lhs_shape)) { - TF_ASSIGN_OR_RETURN(updated_lhs, - AddBroadcastSequence(shape, updated_lhs)); - } TF_ASSIGN_OR_RETURN(const Shape* updated_rhs_shape, GetShapePtr(updated_rhs)); - if (!ShapeUtil::SameDimensions(shape, *updated_rhs_shape)) { - TF_ASSIGN_OR_RETURN(updated_rhs, - AddBroadcastSequence(shape, updated_rhs)); + if (!updated_lhs_shape->is_unbounded_dynamic() && + !updated_rhs_shape->is_unbounded_dynamic()) { + if (!ShapeUtil::SameDimensions(shape, *updated_lhs_shape)) { + TF_ASSIGN_OR_RETURN(updated_lhs, + AddBroadcastSequence(shape, updated_lhs)); + } + if (!ShapeUtil::SameDimensions(shape, *updated_rhs_shape)) { + TF_ASSIGN_OR_RETURN(updated_rhs, + AddBroadcastSequence(shape, updated_rhs)); + } } if (binop == HloOpcode::kCompare) { @@ -2495,6 +2504,25 @@ StatusOr XlaBuilder::SortInternal(const Shape& shape, return AddInstruction(std::move(instr), HloOpcode::kSort, operands); } +XlaOp XlaBuilder::TopK(XlaOp operand, int64_t k, bool largest) { + return ReportErrorOrReturn([&]() -> StatusOr { + std::vector operand_shape_ptrs; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + TF_ASSIGN_OR_RETURN(Shape shape, + ShapeInference::InferTopKShape(*operand_shape, k)); + return TopKInternal(shape, operand, k, largest); + }); +} + +StatusOr XlaBuilder::TopKInternal(const Shape& shape, XlaOp operand, + int64_t k, bool largest) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + instr.set_k(k); + instr.set_largest(largest); + return AddInstruction(std::move(instr), HloOpcode::kTopK, {operand}); +} + XlaOp XlaBuilder::ConvertElementType(XlaOp operand, PrimitiveType new_element_type) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -3910,7 +3938,6 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64_t dimension) { XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64_t dimension) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); Shape shape = *operand_shape; @@ -5210,6 +5237,10 @@ XlaOp Sort(absl::Span operands, const XlaComputation& comparator, is_stable); } +XlaOp TopK(XlaOp operand, int64_t k, bool largest) { + return operand.builder()->TopK(operand, k, largest); +} + XlaOp Clamp(const XlaOp min, const XlaOp operand, const XlaOp max) { return min.builder()->Clamp(min, operand, max); } diff --git a/third_party/xla/xla/client/xla_builder.h b/third_party/xla/xla/client/xla_builder.h index cbc0259bea7944..aca638833e7097 100644 --- a/third_party/xla/xla/client/xla_builder.h +++ b/third_party/xla/xla/client/xla_builder.h @@ -901,6 +901,10 @@ class XlaBuilder { const XlaComputation& comparator, int64_t dimension, bool is_stable); + XlaOp TopK(XlaOp operand, int64_t k, bool largest); + virtual StatusOr TopKInternal(const Shape& shape, XlaOp operand, + int64_t k, bool largest); + XlaOp Clamp(XlaOp min, XlaOp operand, XlaOp max); XlaOp Map(absl::Span operands, const XlaComputation& computation, @@ -1532,6 +1536,7 @@ class XlaBuilder { friend XlaOp Sort(absl::Span operands, const XlaComputation& comparator, int64_t dimension, bool is_stable); + friend XlaOp TopK(XlaOp operand, int64_t k, bool largest); friend XlaOp Clamp(XlaOp min, XlaOp operand, 
XlaOp max); friend XlaOp Map(XlaBuilder* builder, absl::Span operands, const XlaComputation& computation, @@ -2674,6 +2679,26 @@ XlaOp Rev(XlaOp operand, absl::Span dimensions); XlaOp Sort(absl::Span operands, const XlaComputation& comparator, int64_t dimension = -1, bool is_stable = false); +// Enqueues a topk instruction onto the computation. TopK returns the largest +// 'k' values and their indices along the last dimension of the 'operand' if +// `lagest=true` or the smallest `k` values if `largest=false`. +// +// * If the operand is a rank-1 tensor (an array), the result is a tuple that +// consists of: +// * a sorted array with the top 'k' elements. +// * an array containing the indices of the k elements. +// For example, if the input is [0.1, 0.3, 0.2] and k == 2, the output tuple +// is ([0.3, 0.2], [1, 2]). +// * If the operand has higher rank, the result is a tuple that consists of: +// * a tensor equivalent to one produced by sorting the operand along the last +// dimension and slicing that dimension to only the top 'k' values. The last +// dimension is sorted as in the rank-1 case. +// * a tensor containing the indices of the top 'k' values along the last +// dimension. +// For example, if the input is [0.1, 0.3, 0.2][0.5, 0.4, 0.6] and k == 1, the +// output tuple is ([0.3][0.6], [1][2]). +XlaOp TopK(XlaOp operand, int64_t k, bool largest); + // Enqueues a clamp instruction onto the computation. XlaOp Clamp(XlaOp min, XlaOp operand, XlaOp max); diff --git a/third_party/xla/xla/client/xla_builder_test.cc b/third_party/xla/xla/client/xla_builder_test.cc index 60503f890be013..98fafbea51d3c3 100644 --- a/third_party/xla/xla/client/xla_builder_test.cc +++ b/third_party/xla/xla/client/xla_builder_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "xla/client/sharding_builder.h" #include "xla/client/value_inference.h" #include "xla/client/xla_computation.h" @@ -31,9 +32,13 @@ limitations under the License. 
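The TopK builder API added above produces, along the last dimension of the operand, a tuple of the top (or, with largest=false, the bottom) k values and their indices. A minimal usage sketch follows; it is illustrative only and not part of the patch, the helper name is made up, and the element type of the index output is not stated in this diff.

// Illustrative only: build a computation that takes the per-row top-4 values
// of a f32[8, 16] parameter using the TopK op introduced in this change.
#include "xla/client/xla_builder.h"
#include "xla/shape_util.h"

xla::XlaComputation BuildTopKExample() {  // hypothetical helper name
  xla::XlaBuilder b("topk_example");
  xla::XlaOp x =
      xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla::F32, {8, 16}), "x");
  // Root is a tuple: element 0 holds the top-4 values per row (f32[8, 4]),
  // element 1 holds their indices along the last dimension.
  xla::TopK(x, /*k=*/4, /*largest=*/true);
  return b.Build().value();
}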
#include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/hlo/ir/hlo_instructions.h" #include "xla/hlo/ir/hlo_module.h" +#include "xla/service/hlo_parser.h" #include "xla/service/pattern_matcher.h" #include "xla/service/pattern_matcher_gmock.h" +#include "xla/shape.h" #include "xla/shape_util.h" +#include "xla/statusor.h" +#include "xla/test.h" #include "xla/test_helpers.h" #include "xla/util.h" #include "xla/xla_data.pb.h" @@ -197,6 +202,16 @@ TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) { GmockMatch(m::Add(m::Parameter(), m::Broadcast(m::Constant())))); } +TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcastReversed) { + XlaBuilder b(TestName()); + XlaOp x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {3, 5}), "x"); + Add(ConstantR0(&b, 1.0), x); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + GmockMatch(m::Add(m::Broadcast(m::Constant()), m::Parameter()))); +} + TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) { XlaBuilder b(TestName()); const auto& x_shape = ShapeUtil::MakeShape(S32, {2, 4, 6}); @@ -1524,5 +1539,353 @@ TEST_F(XlaBuilderTest, InvalidSharding) { HasSubstr("Number of tile assignment dimensions (excluding " "subgroups) is different than the input rank")); } + +TEST_F(XlaBuilderTest, TopKDimensions) { + XlaBuilder b(TestName()); + int64_t k = 1; + int64_t largest = true; + TopK(Parameter(&b, 0, ShapeUtil::MakeShape(F32, {6, 8}), "p0"), k, largest); + + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_TRUE(root->opcode() == HloOpcode::kTopK); + EXPECT_TRUE(root->shape().IsTuple()); + EXPECT_EQ(root->shape().tuple_shapes_size(), 2); + EXPECT_EQ(root->shape().tuple_shapes(0).rank(), 2); + EXPECT_EQ(root->shape().tuple_shapes(1).rank(), 2); + EXPECT_EQ(root->shape().tuple_shapes(0).dimensions(0), 6); + EXPECT_EQ(root->shape().tuple_shapes(0).dimensions(1), k); + EXPECT_EQ(root->shape().tuple_shapes(1).dimensions(0), 6); + EXPECT_EQ(root->shape().tuple_shapes(1).dimensions(1), k); +} + +TEST_F(XlaBuilderTest, UnboundedAbs) { + XlaBuilder b(TestName()); + StatusOr operand = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr expected = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + ASSERT_IS_OK(operand.status()); + ASSERT_IS_OK(expected.status()); + Abs(Parameter(&b, 0, operand.value(), "operand")); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedAdd) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr rhs = ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"); + StatusOr expected = ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + Add(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << 
ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedAddUnsupportedImplicitBroadcast) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[1]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + Add(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{1}); + StatusOr> build_status = BuildHloModule(&b); + EXPECT_FALSE(build_status.ok()); + EXPECT_THAT(build_status.status().message(), + HasSubstr("Unbounded dynamic shapes not supported")); +} + +TEST_F(XlaBuilderTest, UnboundedDiv) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr rhs = ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"); + StatusOr expected = ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + Div(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedDivUnsupportedImplicitBroadcast) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[1]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + Div(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{1}); + StatusOr> build_status = BuildHloModule(&b); + EXPECT_FALSE(build_status.ok()); + EXPECT_THAT(build_status.status().message(), + HasSubstr("Unbounded dynamic shapes not supported")); +} + +TEST_F(XlaBuilderTest, UnboundedDot) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[?, 10]"); + StatusOr expected = ParseShape("f32[?, 10]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + + Dot(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs")); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + ASSERT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedDotGeneral) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, <=3, ?]"); + StatusOr rhs = ParseShape("f32[2, 4, 5]"); + StatusOr expected = ParseShape("f32[?, <=3, 5]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + + DotDimensionNumbers dnums; + dnums.add_lhs_contracting_dimensions(2); + dnums.add_rhs_contracting_dimensions(1); + dnums.add_lhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(0); + + DotGeneral(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), dnums); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + ASSERT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + 
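The Unbounded* tests in this file lean on the textual shape syntax handled by ParseShape: a plain integer is a static dimension, `<=N` is a dynamic dimension with upper bound N, and `?` is an unbounded dynamic dimension. Judging from the expected shapes above, elementwise ops keep the more constrained of the two operand dimensions: a concrete size or a `<=N` bound wins over `?`, while a size-1 dimension paired with `?` stays unbounded, since the 1 may still need to broadcast. A small self-contained sketch of the notation (not part of the patch; the function name is made up):

// Illustrative only: parse a shape mixing static, bounded-dynamic, and
// unbounded-dynamic dimensions and query its dynamism.
#include "xla/service/hlo_parser.h"
#include "xla/shape.h"

void ShapeNotationExample() {  // hypothetical helper name
  // 3 is static, <=2 is dynamic with an upper bound of 2, ? is unbounded.
  xla::Shape s = xla::ParseShape("f32[3, <=2, ?]").value();
  bool bounded_dynamic = s.is_dynamic_dimension(1);  // true, bound is 2
  bool unbounded = s.is_unbounded_dynamic();         // true, last dim is '?'
  (void)bounded_dynamic;
  (void)unbounded;
}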
+TEST_F(XlaBuilderTest, UnboundedExp) { + XlaBuilder b(TestName()); + StatusOr operand = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr expected = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + ASSERT_IS_OK(operand.status()); + ASSERT_IS_OK(expected.status()); + Exp(Parameter(&b, 0, operand.value(), "operand")); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedMax) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr rhs = ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"); + StatusOr expected = ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + Max(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedMaxUnsupportedImplicitBroadcast) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[1]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + Max(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{1}); + StatusOr> build_status = BuildHloModule(&b); + EXPECT_FALSE(build_status.ok()); + EXPECT_THAT(build_status.status().message(), + HasSubstr("Unbounded dynamic shapes not supported")); +} + +TEST_F(XlaBuilderTest, UnboundedMul) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr rhs = ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"); + StatusOr expected = ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + Mul(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedMulUnsupportedImplicitBroadcast) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[1]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + Mul(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{1}); + StatusOr> build_status = BuildHloModule(&b); + EXPECT_FALSE(build_status.ok()); + EXPECT_THAT(build_status.status().message(), + HasSubstr("Unbounded dynamic shapes not supported")); +} + +TEST_F(XlaBuilderTest, UnboundedPow) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr rhs = ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"); + StatusOr expected = ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"); + 
ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + ASSERT_IS_OK(expected.status()); + Pow(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedPowUnsupportedImplicitBroadcast) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[1]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + Pow(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{1}); + StatusOr> build_status = BuildHloModule(&b); + EXPECT_FALSE(build_status.ok()); + EXPECT_THAT(build_status.status().message(), + HasSubstr("Unbounded dynamic shapes not supported")); +} + +TEST_F(XlaBuilderTest, UnboundedReduce) { + XlaBuilder b(TestName()); + XlaOp input0 = Parameter(&b, 0, ParseShape("f32[7, 5]").value(), "input0"); + XlaOp input1 = Parameter(&b, 1, ParseShape("f32[?, 5]").value(), "input1"); + XlaOp input2 = Parameter(&b, 2, ParseShape("f32[7, ?]").value(), "input2"); + XlaOp init = Parameter(&b, 3, ShapeUtil::MakeShape(F32, {}), "init"); + + XlaBuilder bsum(TestName()); + XlaOp arg0 = Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "arg0"); + XlaOp arg1 = Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "arg1"); + XlaOp arg2 = Parameter(&bsum, 2, ShapeUtil::MakeShape(F32, {}), "arg2"); + XlaOp arg3 = Parameter(&bsum, 3, ShapeUtil::MakeShape(F32, {}), "arg3"); + XlaOp arg4 = Parameter(&bsum, 4, ShapeUtil::MakeShape(F32, {}), "arg4"); + XlaOp arg5 = Parameter(&bsum, 5, ShapeUtil::MakeShape(F32, {}), "arg5"); + + std::vector output_operands = {Add(arg0, arg1), Add(arg2, arg3), + Add(arg4, arg5)}; + Tuple(&bsum, absl::MakeSpan(output_operands)); + TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build()); + Reduce(&b, {input0, input1, input2}, {init, init, init}, sum, {1}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + Shape shape = ShapeUtil::MakeShape(F32, {7}, {false}); + Shape expected = ShapeUtil::MakeTupleShape({shape, shape, shape}); + EXPECT_TRUE(ShapeUtil::Equal(result, expected)); +} + +TEST_F(XlaBuilderTest, UnboundedSlice) { + XlaBuilder b(TestName()); + StatusOr operand = ParseShape("f32[1, <=3, ?]"); + StatusOr expected = ParseShape("f32[1, <=2, 3]"); + ASSERT_IS_OK(operand.status()); + ASSERT_IS_OK(expected.status()); + Slice(Parameter(&b, 0, operand.value(), "operand"), + /*start_indices=*/{0, 1, 2}, + /*limit_indices=*/{1, 3, 5}, + /*strides=*/{1, 1, 1}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto result = module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedSub) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"); + StatusOr rhs = ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"); + StatusOr expected = ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + 
ASSERT_IS_OK(expected.status()); + Sub(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanString(result) + << " expected: " << ShapeUtil::HumanString(expected.value()); +} + +TEST_F(XlaBuilderTest, UnboundedSubUnsupportedImplicitBroadcast) { + XlaBuilder b(TestName()); + StatusOr lhs = ParseShape("f32[?, 10]"); + StatusOr rhs = ParseShape("f32[1]"); + ASSERT_IS_OK(lhs.status()); + ASSERT_IS_OK(rhs.status()); + Sub(Parameter(&b, 0, lhs.value(), "lhs"), + Parameter(&b, 1, rhs.value(), "rhs"), /*broadcast_dimensions=*/{1}); + StatusOr> build_status = BuildHloModule(&b); + EXPECT_FALSE(build_status.ok()); + EXPECT_THAT(build_status.status().message(), + HasSubstr("Unbounded dynamic shapes not supported")); +} + +TEST_F(XlaBuilderTest, UnboundedTranspose) { + XlaBuilder b(TestName()); + StatusOr operand = ParseShape("f32[1, ?, 2, ?, <=2]{4,3,2,1,0}"); + StatusOr expected = ParseShape("f32[<=2, 1, ?, 2, ?]{0,2,3,4,1}"); + ASSERT_IS_OK(operand.status()); + ASSERT_IS_OK(expected.status()); + Transpose(Parameter(&b, 0, operand.value(), "operand"), + /*permutation=*/{4, 0, 3, 2, 1}); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + const Shape& result = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(ShapeUtil::Equal(result, expected.value())) + << "result: " << ShapeUtil::HumanStringWithLayout(result) + << " expected: " << ShapeUtil::HumanStringWithLayout(expected.value()); +} + } // namespace } // namespace xla diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc index e7b4805aa2a34f..4a1d998e5d76a4 100644 --- a/third_party/xla/xla/debug_options_flags.cc +++ b/third_party/xla/xla/debug_options_flags.cc @@ -136,6 +136,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_enable_dumping(true); opts.set_xla_gpu_enable_xla_runtime_executable(true); + opts.set_xla_gpu_enable_custom_fusions(false); opts.set_xla_gpu_nccl_termination_timeout_seconds(-1); opts.set_xla_gpu_enable_shared_constants(true); @@ -200,7 +201,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_collect_cost_model_stats(false); opts.set_xla_gpu_enable_split_k_autotuning(true); - opts.set_xla_gpu_single_wave_autotuning(true); opts.set_xla_gpu_enable_reduction_epilogue_fusion(true); opts.set_xla_gpu_enable_nccl_clique_optimization(false); opts.set_xla_gpu_cublas_fallback(true); @@ -211,7 +211,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_target_config_filename(""); opts.set_xla_gpu_enable_cub_radix_sort(true); opts.set_xla_gpu_enable_cudnn_layer_norm(false); - + opts.set_xla_gpu_threshold_for_windowed_einsum_mib(100000); return opts; } @@ -1066,6 +1066,18 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_gpu_enable_xla_runtime_executable), debug_options->xla_gpu_enable_xla_runtime_executable(), "Whether to enable XLA runtime for XLA:GPU backend")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_custom_fusions", + bool_setter_for(&DebugOptions::set_xla_gpu_enable_custom_fusions), + debug_options->xla_gpu_enable_custom_fusions(), + "Whether to enable XLA custom fusions")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_custom_fusions_re", + 
string_setter_for(&DebugOptions::set_xla_gpu_enable_custom_fusions_re), + debug_options->xla_gpu_enable_custom_fusions_re(), + "Limits custom fusion only to fusions which match this regular " + "expression. Default is all custom fusions registerered in a current " + "process.")); flag_list->push_back( tsl::Flag("xla_gpu_enable_gpu2_runtime", bool_setter_for(&DebugOptions::set_xla_gpu_enable_gpu2_runtime), @@ -1341,13 +1353,6 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_gpu_enable_split_k_autotuning), debug_options->xla_gpu_enable_split_k_autotuning(), "Enable split_k autotuning for triton gemms.")); - flag_list->push_back(tsl::Flag( - "xla_gpu_single_wave_autotuning", - bool_setter_for(&DebugOptions::set_xla_gpu_single_wave_autotuning), - debug_options->xla_gpu_single_wave_autotuning(), - "Enable single \"wave\" autotuning. This uses more memory for " - "compilation, but utilizes CPU cores better, so compilation can be " - "faster.")); flag_list->push_back(tsl::Flag( "xla_gpu_enable_reduction_epilogue_fusion", @@ -1415,6 +1420,13 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_gpu_enable_cub_radix_sort), debug_options->xla_gpu_enable_cub_radix_sort(), "Enable radix sort using CUB for simple shapes")); + flag_list->push_back(tsl::Flag( + "xla_gpu_threshold_for_windowed_einsum_mib", + int64_setter_for( + &DebugOptions::set_xla_gpu_threshold_for_windowed_einsum_mib), + debug_options->xla_gpu_threshold_for_windowed_einsum_mib(), + "Threshold to enable windowed einsum (collective matmul) in MB." + "Default is 100000")); } // NOLINT(readability/fn_size) // Allocates flag_values and flag_objects; this function must not be called more diff --git a/third_party/xla/xla/executable_run_options.cc b/third_party/xla/xla/executable_run_options.cc index 795c5fc4176431..0cb33fce343363 100644 --- a/third_party/xla/xla/executable_run_options.cc +++ b/third_party/xla/xla/executable_run_options.cc @@ -124,6 +124,17 @@ ExecutableRunOptions::gpu_executable_run_options() const { return gpu_executable_run_options_; } +ExecutableRunOptions& ExecutableRunOptions::set_cpu_executable_run_options( + const cpu::CpuExecutableRunOptions* cpu_executable_run_options) { + cpu_executable_run_options_ = cpu_executable_run_options; + return *this; +} + +const cpu::CpuExecutableRunOptions* +ExecutableRunOptions::cpu_executable_run_options() const { + return cpu_executable_run_options_; +} + ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) { rng_seed_ = rng_seed; return *this; diff --git a/third_party/xla/xla/executable_run_options.h b/third_party/xla/xla/executable_run_options.h index 31ba23bf3b7a14..861e13d2a5c02f 100644 --- a/third_party/xla/xla/executable_run_options.h +++ b/third_party/xla/xla/executable_run_options.h @@ -51,6 +51,10 @@ class DeviceAssignment; class ExecutionProfile; class Shape; +namespace cpu { +class CpuExecutableRunOptions; +} // namespace cpu + namespace gpu { class GpuExecutableRunOptions; } // namespace gpu @@ -210,6 +214,12 @@ class ExecutableRunOptions { return recv_device_memory_function_; } + // CPU-backend specific options. These are kept out-of-line to avoid bloating + // the size of this dependency for CPU-only AOT builds. + ExecutableRunOptions& set_cpu_executable_run_options( + const cpu::CpuExecutableRunOptions* cpu_executable_run_options); + const cpu::CpuExecutableRunOptions* cpu_executable_run_options() const; + // GPU-backend specific options. 
These are kept out-of-line to avoid bloating // the size of this dependency for CPU-only AOT builds. ExecutableRunOptions& set_gpu_executable_run_options( @@ -231,6 +241,7 @@ class ExecutableRunOptions { SendDeviceMemoryFunction* send_device_memory_function_ = nullptr; RecvDeviceMemoryFunction* recv_device_memory_function_ = nullptr; RunId run_id_; + const cpu::CpuExecutableRunOptions* cpu_executable_run_options_ = nullptr; const gpu::GpuExecutableRunOptions* gpu_executable_run_options_ = nullptr; }; diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD index af726018ff3db7..53770a9618d2dd 100644 --- a/third_party/xla/xla/ffi/BUILD +++ b/third_party/xla/xla/ffi/BUILD @@ -18,13 +18,17 @@ cc_library( hdrs = ["call_frame.h"], visibility = ["//visibility:public"], deps = [ + ":api", + "//xla:status", "//xla:types", "//xla:xla_data_proto_cc", "//xla/ffi/api:c_api", "//xla/ffi/api:c_api_internal", + "//xla/service:executable", "//xla/stream_executor:device_memory", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", "@com_google_absl//absl/types:span", ], @@ -32,7 +36,6 @@ cc_library( cc_library( name = "ffi", - srcs = ["ffi.cc"], hdrs = ["ffi.h"], visibility = ["//visibility:public"], deps = [ @@ -44,6 +47,34 @@ cc_library( "//xla:xla_data_proto_cc", "//xla/ffi/api:c_api", "//xla/ffi/api:c_api_internal", + "//xla/hlo/ir:hlo", + "//xla/runtime:memref_view", + "//xla/service:executable", + "//xla/stream_executor:device_memory", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "ffi_api", + srcs = ["ffi_api.cc"], + hdrs = ["ffi_api.h"], + visibility = ["//visibility:public"], + deps = [ + ":api", + ":call_frame", + "//xla:status", + "//xla:statusor", + "//xla:types", + "//xla:xla_data_proto_cc", + "//xla/ffi/api:c_api", + "//xla/ffi/api:c_api_internal", + "//xla/hlo/ir:hlo", "//xla/runtime:memref_view", "//xla/service:executable", "//xla/stream_executor:device_memory", @@ -60,11 +91,10 @@ xla_cc_test( name = "ffi_test", srcs = ["ffi_test.cc"], deps = [ - ":api", ":call_frame", ":ffi", + ":ffi_api", "//xla:xla_data_proto_cc", - "//xla/ffi/api:c_api", "//xla/service:executable", "//xla/stream_executor:device_memory", "@com_google_absl//absl/status", diff --git a/third_party/xla/xla/ffi/api/BUILD b/third_party/xla/xla/ffi/api/BUILD index d18f89ed489303..fa35ce81f57128 100644 --- a/third_party/xla/xla/ffi/api/BUILD +++ b/third_party/xla/xla/ffi/api/BUILD @@ -1,3 +1,4 @@ +load("//xla:xla.bzl", "xla_cc_test") load("@local_tsl//tsl:tsl.default.bzl", "filegroup") load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library") @@ -5,6 +6,22 @@ package( default_visibility = ["//visibility:public"], ) +#===-------------------------------------------------------------------------------------------===// +# Public XLA FFI API +#===-------------------------------------------------------------------------------------------===// + +# XLA FFI is a header only library that does not have any dependencies on XLA. The intent is that +# users that do want to register custom FFI handlers with XLA should copy these headers to their +# project, build a shared object with an XLA FFI handler implementation, and load it at run time. 
+# +# `api.h` and `ffi.h` headers provide a C++ library for decoding XLA FFI C API structs into a more +# user friendly C++ types. Shared objects defining XLA FFI handlers should be built with private +# symbol visibility to avoid potential ODR violations coming from template instantiations of +# different XLA FFI versions. +# +# `ffi.h` defines builtin decoding for canonical XLA types, but users can add their own decodings +# with template specializations. + filegroup( name = "api_headers", srcs = ["api.h"], @@ -46,3 +63,26 @@ cc_library( ":c_api", ], ) + +#===-------------------------------------------------------------------------------------------===// +# Internal tests for XLA FFI API +#===-------------------------------------------------------------------------------------------===// + +xla_cc_test( + name = "ffi_test", + srcs = ["ffi_test.cc"], + deps = [ + ":ffi", + "//xla:xla_data_proto_cc", + "//xla/ffi:call_frame", + "//xla/ffi:ffi_api", + "//xla/stream_executor:device_memory", + "@com_google_absl//absl/log:check", + "@local_tsl//tsl/lib/core:status_test_util", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_benchmark", + "@local_tsl//tsl/platform:test_main", + ], +) diff --git a/third_party/xla/xla/ffi/api/api.h b/third_party/xla/xla/ffi/api/api.h index 46105cf58dd602..b37a170f57638d 100644 --- a/third_party/xla/xla/ffi/api/api.h +++ b/third_party/xla/xla/ffi/api/api.h @@ -18,18 +18,21 @@ limitations under the License. #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include // This is a header-only base C++ library that defines templates for decoding @@ -54,6 +57,22 @@ limitations under the License. #include "xla/ffi/api/c_api.h" +#if __has_attribute(always_inline) +#define XLA_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define XLA_ATTRIBUTE_ALWAYS_INLINE __forceinline +#else +#define XLA_ATTRIBUTE_ALWAYS_INLINE inline +#endif + +#if __has_attribute(noinline) +#define XLA_ATTRIBUTE_NEVER_INLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define XLA_ATTRIBUTE_NEVER_INLINE __declspec(noinline) +#else +#define XLA_ATTRIBUTE_NEVER_INLINE +#endif + namespace xla::ffi { // Forward declare template defined below. @@ -147,17 +166,61 @@ XLA_FFI_Error* Ffi::CheckStructSize(const XLA_FFI_Api* api, // Type tags for distinguishing handler argument types //===----------------------------------------------------------------------===// +// Forward declare. +class Dictionary; + namespace internal { +// WARNING: A lot of template metaprogramming on top of C++ variadic templates +// parameter packs. We need this to be able to pattern match FFI handler +// signature at compile time. + +// A type tag to forward all remaining args as `RemainingArgs`. +struct RemainingArgsTag {}; + // A type tag to distinguish arguments tied to the attributes in the // `Binding` variadic template argument. template struct AttrTag {}; +// A type tag to forward all attributes as `Dictionary` (and optionally decode +// it into a custom struct). +template +struct AttrsTag {}; + // A type tag to distinguish arguments extracted from an execution context. template struct CtxTag {}; +//----------------------------------------------------------------------------// +// A template for counting tagged arguments in the Ts pack (i.e. attributes). 
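The tag structs above are what let the binding machinery pattern-match a handler signature purely at compile time, and the header goes on to define a helper that counts tagged entries in the Ts parameter pack. A generic, self-contained sketch of that idea is shown below; it is not XLA's exact code, and `NumTagged` plus the local `AttrTag` stand-in are illustrative names.

// Illustrative only: count how many types in Ts... are wrapped in Tag<...>,
// using partial specialization on the head of the pack.
#include <cstddef>
#include <cstdint>

template <typename T>
struct AttrTag {};  // local stand-in for the tag types declared above

template <template <typename> class Tag, typename... Ts>
struct NumTagged;

// Empty pack: nothing left to count.
template <template <typename> class Tag>
struct NumTagged<Tag> {
  static constexpr std::size_t value = 0;
};

// Head is wrapped in Tag: count it and recurse on the tail.
template <template <typename> class Tag, typename T, typename... Ts>
struct NumTagged<Tag, Tag<T>, Ts...> {
  static constexpr std::size_t value = 1 + NumTagged<Tag, Ts...>::value;
};

// Head is anything else: skip it and recurse on the tail.
template <template <typename> class Tag, typename T, typename... Ts>
struct NumTagged<Tag, T, Ts...> {
  static constexpr std::size_t value = NumTagged<Tag, Ts...>::value;
};

static_assert(
    NumTagged<AttrTag, float, AttrTag<int32_t>, AttrTag<float>>::value == 2,
    "two of the three pack entries are attribute-tagged");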
+//----------------------------------------------------------------------------// + +template